From 9a9c4347d27b70800d377cc7bddb271f31b737aa Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Mon, 15 Jun 2020 16:36:06 +0200 Subject: [PATCH 1/9] Calculate multiple scores in one single run --- .../riskscore/commands/ApplyScoreCommand.java | 23 +- .../java/genepi/riskscore/io/OutputFile.java | 20 +- .../genepi/riskscore/model/RiskScore.java | 15 +- .../riskscore/model/RiskScoreSummary.java | 134 ++++++++++ .../riskscore/tasks/ApplyScoreTask.java | 244 ++++++++---------- .../riskscore/tasks/ApplyScoreTaskTest.java | 206 +++++++++------ 6 files changed, 400 insertions(+), 242 deletions(-) create mode 100644 src/main/java/genepi/riskscore/model/RiskScoreSummary.java diff --git a/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java b/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java index 114cf84..1bdd7b7 100644 --- a/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java +++ b/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java @@ -9,6 +9,7 @@ import genepi.riskscore.io.Chunk; import genepi.riskscore.io.OutputFile; import genepi.riskscore.model.RiskScoreFormat; +import genepi.riskscore.model.RiskScoreSummary; import genepi.riskscore.tasks.ApplyScoreTask; import htsjdk.samtools.util.StopWatch; import picocli.CommandLine.ArgGroup; @@ -80,7 +81,7 @@ public Integer call() throws Exception { System.out.println(); ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFilename(ref); + task.setRiskScoreFilenames(ref); if (chunk != null) { task.setChunk(chunk); } @@ -104,7 +105,7 @@ public Integer call() throws Exception { watch.start(); task.run(); - OutputFile output = new OutputFile(task.getRiskScores()); + OutputFile output = new OutputFile(task.getRiskScores(), task.getSummaries()); output.save(out); watch.stop(); @@ -119,19 +120,11 @@ public Integer call() throws Exception { System.out.println(" - Samples: " + number(task.getCountSamples())); System.out.println(" - Variants: " + number(task.getCountVariants())); System.out.println(); - System.out.println(" Risk Score:"); - System.out.println(" - Variants: " + number(task.getCountVariantsRiskScore())); - System.out.println(" - Variants used: " + number(task.getCountVariantsUsed()) + " (" - + percentage(task.getCountVariantsUsed(), task.getCountVariantsRiskScore()) + ")"); - System.out.println(" - Found in target and filtered by: "); - System.out.println(" - allele mismatch: " + number(task.getCountVariantsAlleleMissmatch())); - System.out.println(" - multi allelic or indels: " + number(task.getCountVariantsMultiAllelic())); - System.out.println(" - low R2 value: " + number(task.getCountVariantsFilteredR2())); - System.out.println(" - variants file: " + number(task.getCountFiltered())); - - int notFound = task.getCountVariantsRiskScore() - - (task.getCountVariantsUsed() + task.getCountFiltered() + task.getCountVariantsAlleleMissmatch() - + task.getCountVariantsMultiAllelic() + task.getCountVariantsFilteredR2()); + + for (RiskScoreSummary summary: task.getSummaries()) { + System.out.println(summary); + System.out.println(); + } // System.out.println(" - Not found in target: " + number(notFound)); System.out.println(); diff --git a/src/main/java/genepi/riskscore/io/OutputFile.java b/src/main/java/genepi/riskscore/io/OutputFile.java index 6f2e2a6..932bcbe 100644 --- a/src/main/java/genepi/riskscore/io/OutputFile.java +++ b/src/main/java/genepi/riskscore/io/OutputFile.java @@ -9,6 +9,7 @@ import genepi.io.table.writer.CsvTableWriter; import genepi.io.table.writer.ITableWriter; import genepi.riskscore.model.RiskScore; +import genepi.riskscore.model.RiskScoreSummary; public class OutputFile { @@ -28,15 +29,26 @@ public OutputFile() { } - public OutputFile(RiskScore[] finalScores) { + public OutputFile(RiskScore[] finalScores, RiskScoreSummary[] summaries) { + + this.scores = new Vector(); + for (RiskScoreSummary summary : summaries) { + scores.add(summary.getName()); + } + samples = new Vector(); - data = new Vector[1]; - data[0] = new Vector(); + data = new Vector[1 + scores.size()]; + for (int i = 0; i < scores.size(); i++) { + data[i] = new Vector(); + } scores = new Vector(); scores.add(COLUMN_SCORE); + this.scores = scores; for (RiskScore riskScore : finalScores) { samples.add(riskScore.getSample()); - data[0].add(riskScore.getScore()); + for (int i = 0; i < scores.size(); i++) { + data[i].add(riskScore.getScore(i)); + } } } diff --git a/src/main/java/genepi/riskscore/model/RiskScore.java b/src/main/java/genepi/riskscore/model/RiskScore.java index 25893b4..5add809 100644 --- a/src/main/java/genepi/riskscore/model/RiskScore.java +++ b/src/main/java/genepi/riskscore/model/RiskScore.java @@ -6,19 +6,20 @@ public class RiskScore { private String chromosome; - private float score = 0; + private float[] scores; - public RiskScore(String chromosome, String sample) { + public RiskScore(String chromosome, String sample, int numberOfScores) { this.chromosome = chromosome; this.sample = sample; + this.scores = new float[numberOfScores]; } - public void setScore(float score) { - this.score = score; + public void setScore(int index, float score) { + this.scores[index] = score; } - public float getScore() { - return score; + public float getScore(int index) { + return scores[index]; } public String getSample() { @@ -31,6 +32,6 @@ public String getChromosome() { @Override public String toString() { - return sample + ": " + score; + return sample + ": " + scores; } } diff --git a/src/main/java/genepi/riskscore/model/RiskScoreSummary.java b/src/main/java/genepi/riskscore/model/RiskScoreSummary.java new file mode 100644 index 0000000..c1f6fe3 --- /dev/null +++ b/src/main/java/genepi/riskscore/model/RiskScoreSummary.java @@ -0,0 +1,134 @@ +package genepi.riskscore.model; + +import java.text.DecimalFormat; + +public class RiskScoreSummary { + + private String name; + + private int variants = 0; + + private int variantsUsed = 0; + + private int variantsSwitched = 0; + + private int variantsMultiAllelic = 0; + + private int variantsAlleleMissmatch = 0; + + private int R2Filtered = 0; + + private int NotFound = 0; + + private int Filtered = 0; + + public RiskScoreSummary(String name) { + this.name = name; + } + + public String getName() { + return name; + } + + public int getVariantsUsed() { + return variantsUsed; + } + + public void incVariantsUsed() { + this.variantsUsed++; + } + + public int getSwitched() { + return variantsSwitched; + } + + public void incSwitched() { + this.variantsSwitched++; + } + + public int getMultiAllelic() { + return variantsMultiAllelic; + } + + public void incMultiAllelic() { + this.variantsMultiAllelic++; + } + + public int getAlleleMissmatch() { + return variantsAlleleMissmatch; + } + + public void incAlleleMissmatch() { + this.variantsAlleleMissmatch++; + } + + public int getR2Filtered() { + return R2Filtered; + } + + public void incR2Filtered() { + this.R2Filtered++; + } + + public int getVariants() { + return variants; + } + + public void setVariants(int count) { + this.variants = count; + } + + public int getNotFound() { + return NotFound; + } + + public void incNotFound() { + this.NotFound++; + } + + public int getFiltered() { + return Filtered; + } + + public void incFiltered() { + this.Filtered++; + } + + public int getVariantsNotUsed() { + return (variants - variantsUsed); + } + + @Override + public String toString() { + + StringBuffer buffer = new StringBuffer(); + + buffer.append(" " + name + ":\n"); + buffer.append(" - Variants: " + number(getVariants()) + "\n"); + buffer.append(" - Variants used: " + number(getVariantsUsed()) + " (" + + percentage(getVariantsUsed(), getVariants()) + ")\n"); + buffer.append(" - Found in target and filtered by:\n"); + buffer.append(" - allele mismatch: " + number(getAlleleMissmatch()) + "\n"); + buffer.append(" - multi allelic or indels: " + number(getMultiAllelic()) + "\n"); + buffer.append(" - low R2 value: " + number(getR2Filtered()) + "\n"); + buffer.append(" - variants file: " + number(getFiltered()) + "\n"); + + int notFound = getVariants() + - (getVariantsUsed() + getFiltered() + getAlleleMissmatch() + getMultiAllelic() + getR2Filtered()); + + return buffer.toString(); + + } + + public static String number(long number) { + DecimalFormat formatter = new DecimalFormat("###,###,###"); + return formatter.format(number); + } + + public static String percentage(double obtained, double total) { + double percentage = (obtained / total) * 100; + DecimalFormat df = new DecimalFormat("###.##'%'"); + return df.format(percentage); + } + +} diff --git a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java index 5ef77dd..1fd309b 100644 --- a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java +++ b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java @@ -14,6 +14,7 @@ import genepi.riskscore.model.ReferenceVariant; import genepi.riskscore.model.RiskScore; import genepi.riskscore.model.RiskScoreFormat; +import genepi.riskscore.model.RiskScoreSummary; public class ApplyScoreTask { @@ -21,30 +22,12 @@ public class ApplyScoreTask { private List vcfs = null; - private String riskScoreFilename = null; + private String riskScoreFilenames[] = null; private int countSamples = 0; private int countVariants = 0; - private int countVariantsUsed = 0; - - private int countVariantsSwitched = 0; - - private int countVariantsMultiAllelic = 0; - - private int countVariantsNotUsed = 0; - - private int countVariantsAlleleMissmatch = 0; - - private int countR2Filtered = 0; - - private int countVariantsRiskScore = 0; - - private int countNotFound = 0; - - private int countFiltered = 0; - private Chunk chunk = null; private float minR2 = 0; @@ -59,12 +42,16 @@ public class ApplyScoreTask { private String genotypeFormat = DOSAGE_FORMAT; + private int numberRiskScores = 0; + + private RiskScoreSummary[] summaries; + public static final String INFO_R2 = "R2"; public static final String DOSAGE_FORMAT = "DS"; - public void setRiskScoreFilename(String filename) { - this.riskScoreFilename = filename; + public void setRiskScoreFilenames(String... filenames) { + this.riskScoreFilenames = filenames; } public void setChunk(Chunk chunk) { @@ -100,8 +87,8 @@ public void run() throws Exception { throw new Exception("Please specify at leat one vcf file."); } - if (riskScoreFilename == null) { - throw new Exception("Reference can not be null."); + if (riskScoreFilenames == null || riskScoreFilenames.length == 0) { + throw new Exception("Reference can not be null or empty."); } long start = System.currentTimeMillis(); @@ -111,23 +98,31 @@ public void run() throws Exception { variantFile.setColumns(new String[] { VariantFile.CHROMOSOME, VariantFile.POSITION }); } + numberRiskScores = riskScoreFilenames.length; + summaries = new RiskScoreSummary[numberRiskScores]; + for (int i = 0; i < numberRiskScores; i++) { + if (i == 0) { + summaries[i] = new RiskScoreSummary("score"); + } else { + summaries[i] = new RiskScoreSummary("score_" + i); + } + } + for (String vcfFilename : vcfs) { - processVCF(vcfFilename, riskScoreFilename); + processVCF(vcfFilename, riskScoreFilenames); } if (variantFile != null) { variantFile.close(); } - countVariantsNotUsed = (countVariantsRiskScore - countVariantsUsed); - long end = System.currentTimeMillis(); System.out.println("Execution Time: " + ((end - start) / 1000.0 / 60.0) + " min"); } - private void processVCF(String vcfFilename, String riskScoreFilename) throws Exception { + private void processVCF(String vcfFilename, String... riskScoreFilenames) throws Exception { // read chromosome from first variant String chromosome = null; @@ -147,22 +142,24 @@ private void processVCF(String vcfFilename, String riskScoreFilename) throws Exc includeVariants.buildIndex(chromosome); System.out.println("Loaded " + includeVariants.getCacheSize() + " variants for chromosome " + chromosome); } + RiskScoreFile[] riskscores = new RiskScoreFile[numberRiskScores]; + for (int i = 0; i < numberRiskScores; i++) { + RiskScoreFile riskscore = new RiskScoreFile(riskScoreFilenames[i], format); + System.out.println("Loading file " + riskScoreFilenames[i] + "..."); + + if (chunk != null) { + riskscore.buildIndex(chromosome, chunk); + } else { + riskscore.buildIndex(chromosome); + } - RiskScoreFile riskscore = new RiskScoreFile(riskScoreFilename, format); - System.out.println("Loading file " + riskScoreFilename + "..."); + summaries[i].setVariants(riskscore.getTotalVariants()); - if (chunk != null) { - riskscore.buildIndex(chromosome, chunk); - } else { - riskscore.buildIndex(chromosome); - } + System.out.println("Loaded " + riskscore.getCacheSize() + " weights for chromosome " + chromosome); - if (countVariantsRiskScore == 0) { - countVariantsRiskScore = riskscore.getTotalVariants(); + System.out.println("Loading file " + vcfFilename + "..."); + riskscores[i] = riskscore; } - System.out.println("Loaded " + riskscore.getCacheSize() + " weights for chromosome " + chromosome); - - System.out.println("Loading file " + vcfFilename + "..."); vcfReader = new FastVCFFileReader(vcfFilename); countSamples = vcfReader.getGenotypedSamples().size(); @@ -170,7 +167,9 @@ private void processVCF(String vcfFilename, String riskScoreFilename) throws Exc if (riskScores == null) { riskScores = new RiskScore[countSamples]; for (int i = 0; i < countSamples; i++) { - riskScores[i] = new RiskScore(chromosome, vcfReader.getGenotypedSamples().get(i)); + riskScores[i] = new RiskScore(chromosome, vcfReader.getGenotypedSamples().get(i), + riskScoreFilenames.length); + } } else { if (riskScores.length != countSamples) { @@ -193,88 +192,94 @@ private void processVCF(String vcfFilename, String riskScoreFilename) throws Exc int position = variant.getStart(); - boolean isPartOfRiskScore = riskscore.contains(position); + for (int j = 0; j < riskScoreFilenames.length; j++) { - if (!isPartOfRiskScore) { - countNotFound++; - continue; - } + RiskScoreSummary summary = summaries[j]; + + RiskScoreFile riskscore = riskscores[j]; + boolean isPartOfRiskScore = riskscore.contains(position); - if (includeVariants != null) { - if (!includeVariants.contains(position)) { - countFiltered++; + if (!isPartOfRiskScore) { + summary.incNotFound(); continue; } - } - // Imputation Quality Filter - double r2 = variant.getInfoAsDouble(INFO_R2, 0); - if (r2 < minR2) { - countR2Filtered++; - continue; - } + if (includeVariants != null) { + if (!includeVariants.contains(position)) { + summary.incFiltered(); + continue; + } + } - ReferenceVariant referenceVariant = riskscore.getVariant(position); + // Imputation Quality Filter + double r2 = variant.getInfoAsDouble(INFO_R2, 0); + if (r2 < minR2) { + summary.incR2Filtered(); + continue; + } - if (variant.isComplexIndel()) { - countVariantsMultiAllelic++; - continue; - } + ReferenceVariant referenceVariant = riskscore.getVariant(position); - float effectWeight = -referenceVariant.getEffectWeight(); + if (variant.isComplexIndel()) { + summary.incMultiAllelic(); + continue; + } - char referenceAllele = variant.getReferenceAllele().charAt(0); + float effectWeight = -referenceVariant.getEffectWeight(); - // ignore deletions - if (variant.getAlternateAllele().length() == 0) { - countVariantsMultiAllelic++; - continue; - } + char referenceAllele = variant.getReferenceAllele().charAt(0); - char alternateAllele = variant.getAlternateAllele().charAt(0); + // ignore deletions + if (variant.getAlternateAllele().length() == 0) { + summary.incMultiAllelic(); + continue; + } - if (!referenceVariant.hasAllele(referenceAllele) || !referenceVariant.hasAllele(alternateAllele)) { - countVariantsAlleleMissmatch++; - continue; - } + char alternateAllele = variant.getAlternateAllele().charAt(0); - if (!referenceVariant.isEffectAllele(referenceAllele)) { - effectWeight = -effectWeight; - countVariantsSwitched++; - } + if (!referenceVariant.hasAllele(referenceAllele) || !referenceVariant.hasAllele(alternateAllele)) { + summary.incAlleleMissmatch(); + continue; + } - if (variantFile != null) { - variantFile.setString(VariantFile.CHROMOSOME, variant.getContig()); - variantFile.setInteger(VariantFile.POSITION, variant.getStart()); - variantFile.next(); - } + if (!referenceVariant.isEffectAllele(referenceAllele)) { + effectWeight = -effectWeight; + summary.incSwitched(); + } - String[] values = variant.getGenotypes(genotypeFormat); + if (variantFile != null) { + variantFile.setString(VariantFile.CHROMOSOME, variant.getContig()); + variantFile.setInteger(VariantFile.POSITION, variant.getStart()); + variantFile.next(); + } - for (int i = 0; i < countSamples; i++) { - float dosage = 0; - // genotypes - if (values[i].equals("0|0")) { - dosage = 0; - } else if (values[i].equals("0|1") || values[i].equals("1|0")) { - dosage = 1; - } else if (values[i].equals("1|1")) { - dosage = 2; - } else { - // dosage - dosage = Float.parseFloat(values[i]); + String[] values = variant.getGenotypes(genotypeFormat); + + for (int i = 0; i < countSamples; i++) { + float dosage = 0; + // genotypes + if (values[i].equals("0|0")) { + dosage = 0; + } else if (values[i].equals("0|1") || values[i].equals("1|0")) { + dosage = 1; + } else if (values[i].equals("1|1")) { + dosage = 2; + } else { + // dosage + dosage = Float.parseFloat(values[i]); + } + float score = riskScores[i].getScore(j) + (dosage * effectWeight); + riskScores[i].setScore(j, score); } - float score = riskScores[i].getScore() + (dosage * effectWeight); - riskScores[i].setScore(score); - } - countVariantsUsed++; + summary.incVariantsUsed(); + } } vcfReader.close(); - System.out.println("Loaded " + getRiskScores().length + " samples and " + getCountVariants() + " variants."); + System.out.println("Loaded " + getRiskScores().length + " samples and " + countVariants + " variants."); } @@ -294,44 +299,11 @@ public RiskScore[] getRiskScores() { return riskScores; } - public int getCountVariants() { - return countVariants; - } - - public int getCountVariantsUsed() { - return countVariantsUsed; - } - - public int getCountVariantsSwitched() { - return countVariantsSwitched; - } - - public int getCountVariantsMultiAllelic() { - return countVariantsMultiAllelic; - } - - public int getCountVariantsNotUsed() { - return countVariantsNotUsed; - } - - public int getCountVariantsAlleleMissmatch() { - return countVariantsAlleleMissmatch; - } - - public int getCountVariantsFilteredR2() { - return countR2Filtered; - } - - public int getCountVariantsRiskScore() { - return countVariantsRiskScore; - } - - public int getCountVariantsNotFound() { - return countNotFound; + public RiskScoreSummary[] getSummaries() { + return summaries; } - public int getCountFiltered() { - return countFiltered; + public int getCountVariants() { + return countVariants; } - } diff --git a/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java b/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java index 29ffead..3af3c24 100644 --- a/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java +++ b/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java @@ -8,6 +8,7 @@ import genepi.riskscore.io.VariantFile; import genepi.riskscore.model.RiskScore; import genepi.riskscore.model.RiskScoreFormat; +import genepi.riskscore.model.RiskScoreSummary; public class ApplyScoreTaskTest { @@ -19,15 +20,17 @@ public void testPerformance() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/chr20.dose.vcf.gz"); - task.setRiskScoreFilename("test-data/chr20.scores.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.run(); assertEquals(63480, task.getCountVariants()); - assertEquals(3, task.getCountVariantsUsed()); - assertEquals(1, task.getCountVariantsSwitched()); - assertEquals(1, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(3, summary.getVariantsUsed()); + assertEquals(1, summary.getSwitched()); + assertEquals(1, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(EXPECTED_SAMPLES, task.getCountSamples()); } @@ -38,38 +41,66 @@ public void testMultiPostion() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/small.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.run(); assertEquals(4, task.getCountVariants()); - assertEquals(3, task.getCountVariantsUsed()); - assertEquals(1, task.getCountVariantsSwitched()); - assertEquals(1, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(1, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(3, summary.getVariantsUsed()); + assertEquals(1, summary.getSwitched()); + assertEquals(1, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(1, summary.getAlleleMissmatch()); assertEquals(EXPECTED_SAMPLES, task.getCountSamples()); assertEquals(EXPECTED_SAMPLES, task.getRiskScores().length); } + @Test + public void testMultipleScores() throws Exception { + + ApplyScoreTask task = new ApplyScoreTask(); + task.setRiskScoreFormat(new RiskScoreFormat()); + task.setVcfFilenames("test-data/chr20.dose.vcf.gz"); + task.setRiskScoreFilenames("test-data/chr20.scores.csv", "test-data/chr20.scores.csv", + "test-data/chr20.scores.csv"); + task.run(); + + assertEquals(63480, task.getCountVariants()); + + assertEquals(3, task.getSummaries().length); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(3, summary.getVariantsUsed()); + assertEquals(1, summary.getSwitched()); + assertEquals(1, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); + assertEquals(EXPECTED_SAMPLES, task.getCountSamples()); + + } + @Test public void testScore() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/single.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.run(); assertEquals(5, task.getCountVariants()); - assertEquals(3, task.getCountVariantsUsed()); - assertEquals(1, task.getCountVariantsSwitched()); - assertEquals(1, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(1, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(3, summary.getVariantsUsed()); + assertEquals(1, summary.getSwitched()); + assertEquals(1, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(1, summary.getAlleleMissmatch()); assertEquals(1, task.getCountSamples()); assertEquals(1, task.getRiskScores().length); assertEquals("LF001", task.getRiskScores()[0].getSample()); - assertEquals(-0.4, task.getRiskScores()[0].getScore(), 0.00001); + assertEquals(-0.4, task.getRiskScores()[0].getScore(0), 0.00001); } @@ -79,19 +110,21 @@ public void testScoreSwitchEffectAllele() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/single.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.2.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.run(); assertEquals(5, task.getCountVariants()); - assertEquals(3, task.getCountVariantsUsed()); - assertEquals(0, task.getCountVariantsSwitched()); - assertEquals(1, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(1, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(3, summary.getVariantsUsed()); + assertEquals(0, summary.getSwitched()); + assertEquals(1, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(1, summary.getAlleleMissmatch()); assertEquals(1, task.getCountSamples()); assertEquals(1, task.getRiskScores().length); assertEquals("LF001", task.getRiskScores()[0].getSample()); - assertEquals(-0.6, task.getRiskScores()[0].getScore(), 0.00001); + assertEquals(-0.6, task.getRiskScores()[0].getScore(0), 0.00001); } @Test @@ -100,21 +133,23 @@ public void testMinR2_06() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/two.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.setMinR2(0.6f); task.run(); assertEquals(5, task.getCountVariants()); - assertEquals(1, task.getCountVariantsUsed()); - assertEquals(0, task.getCountVariantsSwitched()); - assertEquals(3, task.getCountVariantsNotUsed()); - assertEquals(3, task.getCountVariantsFilteredR2()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(1, summary.getVariantsUsed()); + assertEquals(0, summary.getSwitched()); + assertEquals(3, summary.getVariantsNotUsed()); + assertEquals(3, summary.getR2Filtered()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(2, task.getCountSamples()); assertEquals(2, task.getRiskScores().length); assertEquals("LF001", task.getRiskScores()[0].getSample()); - assertEquals(-0.2, task.getRiskScores()[0].getScore(), 0.00001); + assertEquals(-0.2, task.getRiskScores()[0].getScore(0), 0.00001); } @@ -124,21 +159,23 @@ public void testMinR2_05() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/two.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.2.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.setMinR2(0.5f); task.run(); assertEquals(5, task.getCountVariants()); - assertEquals(2, task.getCountVariantsUsed()); - assertEquals(0, task.getCountVariantsSwitched()); - assertEquals(2, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(2, summary.getVariantsUsed()); + assertEquals(0, summary.getSwitched()); + assertEquals(2, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(2, task.getCountSamples()); - assertEquals(2, task.getCountVariantsFilteredR2()); + assertEquals(2, summary.getR2Filtered()); assertEquals(2, task.getRiskScores().length); assertEquals("LF001", task.getRiskScores()[0].getSample()); - assertEquals(-0.3, task.getRiskScores()[0].getScore(), 0.00001); + assertEquals(-0.3, task.getRiskScores()[0].getScore(0), 0.00001); } @@ -148,21 +185,23 @@ public void testMinR2_1() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/two.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.2.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.setMinR2(1f); task.run(); assertEquals(5, task.getCountVariants()); - assertEquals(0, task.getCountVariantsUsed()); - assertEquals(0, task.getCountVariantsSwitched()); - assertEquals(4, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(0, summary.getVariantsUsed()); + assertEquals(0, summary.getSwitched()); + assertEquals(4, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(2, task.getCountSamples()); - assertEquals(4, task.getCountVariantsFilteredR2()); + assertEquals(4, summary.getR2Filtered()); assertEquals(2, task.getRiskScores().length); assertEquals("LF001", task.getRiskScores()[0].getSample()); - assertEquals(0.0, task.getRiskScores()[0].getScore(), 0.00001); + assertEquals(0.0, task.getRiskScores()[0].getScore(0), 0.00001); } @@ -172,28 +211,30 @@ public void testMultipleFiles() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.vcf"); - task.setRiskScoreFilename("test-data/test.scores.csv"); + task.setRiskScoreFilenames("test-data/test.scores.csv"); task.run(); assertEquals(10, task.getCountVariants()); - assertEquals(11, task.getCountVariantsRiskScore()); - assertEquals(7, task.getCountVariantsUsed()); - assertEquals(4, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsSwitched()); - assertEquals(0, task.getCountVariantsFilteredR2()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(11, summary.getVariants()); + assertEquals(7, summary.getVariantsUsed()); + assertEquals(4, summary.getVariantsNotUsed()); + assertEquals(0, summary.getSwitched()); + assertEquals(0, summary.getR2Filtered()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(2, task.getCountSamples()); assertEquals(2, task.getRiskScores().length); RiskScore first = task.getRiskScores()[0]; assertEquals("LF001", first.getSample()); - assertEquals(-(1 + 3), first.getScore(), 0.0000001); + assertEquals(-(1 + 3), first.getScore(0), 0.0000001); RiskScore second = task.getRiskScores()[1]; assertEquals("LF002", second.getSample()); - assertEquals(-(3 + 7), second.getScore(), 0.0000001); + assertEquals(-(3 + 7), second.getScore(0), 0.0000001); } @@ -203,13 +244,15 @@ public void testWriteVariantFile() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.vcf"); - task.setRiskScoreFilename("test-data/test.scores.csv"); + task.setRiskScoreFilenames("test-data/test.scores.csv"); task.setOutputVariantFilename("variants.txt"); task.run(); assertEquals(10, task.getCountVariants()); - assertEquals(11, task.getCountVariantsRiskScore()); - assertEquals(7, task.getCountVariantsUsed()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(11, summary.getVariants()); + assertEquals(7, summary.getVariantsUsed()); VariantFile variants = new VariantFile("variants.txt"); variants.buildIndex("1"); @@ -221,24 +264,26 @@ public void testWriteVariantFile() throws Exception { assertEquals(3, variants.getCacheSize()); } - + @Test public void testReadVariantsFile() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.vcf"); - task.setRiskScoreFilename("test-data/test.scores.csv"); + task.setRiskScoreFilenames("test-data/test.scores.csv"); task.setIncludeVariantFilename("test-data/variants.txt"); task.run(); assertEquals(10, task.getCountVariants()); - assertEquals(11, task.getCountVariantsRiskScore()); - assertEquals(5, task.getCountVariantsUsed()); - assertEquals(0, task.getCountVariantsSwitched()); - assertEquals(0, task.getCountVariantsFilteredR2()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(11, summary.getVariants()); + assertEquals(5, summary.getVariantsUsed()); + assertEquals(0, summary.getSwitched()); + assertEquals(0, summary.getR2Filtered()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(2, task.getCountSamples()); assertEquals(2, task.getRiskScores().length); @@ -250,7 +295,7 @@ public void testWrongChromosome() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/single.wrong_chr.vcf"); - task.setRiskScoreFilename("test-data/chr20.scores.2.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.setMinR2(1f); task.run(); @@ -262,7 +307,7 @@ public void testDifferentSamples() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.wrong.vcf"); - task.setRiskScoreFilename("test-data/test.scores.csv"); + task.setRiskScoreFilenames("test-data/test.scores.csv"); task.setMinR2(1f); task.run(); @@ -274,7 +319,7 @@ public void testWithChunk() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); task.setRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/chr20.dose.vcf.gz"); - task.setRiskScoreFilename("test-data/chr20.scores.csv"); + task.setRiskScoreFilenames("test-data/chr20.scores.csv"); Chunk chunk = new Chunk(); chunk.setStart(61795); chunk.setEnd(63231); @@ -282,14 +327,15 @@ public void testWithChunk() throws Exception { task.run(); assertEquals(63480, task.getCountVariants()); - assertEquals(2, task.getCountVariantsUsed()); - assertEquals(1, task.getCountVariantsSwitched()); - assertEquals(2, task.getCountVariantsNotUsed()); - assertEquals(0, task.getCountVariantsMultiAllelic()); - assertEquals(0, task.getCountVariantsAlleleMissmatch()); + + RiskScoreSummary summary = task.getSummaries()[0]; + assertEquals(2, summary.getVariantsUsed()); + assertEquals(1, summary.getSwitched()); + assertEquals(2, summary.getVariantsNotUsed()); + assertEquals(0, summary.getMultiAllelic()); + assertEquals(0, summary.getAlleleMissmatch()); assertEquals(EXPECTED_SAMPLES, task.getCountSamples()); } - } From 242b65999e8928055c7f53a8d0c9f3578116c39f Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Mon, 15 Jun 2020 17:15:16 +0200 Subject: [PATCH 2/9] Refactore variant context and parsing --- .../io/vcf/MinimalVariantContext.java | 46 +++++++++++++------ .../riskscore/tasks/ApplyScoreTask.java | 15 +----- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java b/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java index 081d02a..f9ef44d 100644 --- a/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java +++ b/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java @@ -32,19 +32,19 @@ public class MinimalVariantContext { private boolean[] genotypes; - private String[] genotypesParsed; - + private float[] genotypesParsed; + private String id = null; private String genotype = null; private String info = null; - + private Map infos; - + public MinimalVariantContext(int samples) { genotypes = new boolean[samples]; - genotypesParsed = new String[samples]; + genotypesParsed = new float[samples]; } public int getHetCount() { @@ -199,20 +199,20 @@ public String toString() { } return id; } - + public void setInfo(String info) { this.info = info; infos = null; } - + public String getInfo(String key) { - //lazy loadding + // lazy loadding if (infos == null) { infos = new HashMap(); String[] tiles = this.info.split(";"); for (int i = 0; i < tiles.length; i++) { String[] tiles2 = tiles[i].split("="); - if(tiles2.length == 2) { + if (tiles2.length == 2) { infos.put(tiles2[0], tiles2[1]); } } @@ -228,8 +228,11 @@ public double getInfoAsDouble(String key, double defaultValue) { return defaultValue; } } - - public String[] getGenotypes(String field) throws IOException { + + public float[] getGenotypeDosages(String field) throws IOException { + + //TODO: implement dirty flag, and parse genotypes only once + String tiles[] = rawLine.split("\t", 10); String[] formats = tiles[8].split(":"); int index = -1; @@ -238,7 +241,7 @@ public String[] getGenotypes(String field) throws IOException { index = i; } } - + if (index == -1) { throw new IOException("field '" + field + "' not found in FORMAT"); } @@ -246,9 +249,24 @@ public String[] getGenotypes(String field) throws IOException { for (int i = 0; i < genotypesParsed.length; i++) { String value = values[i]; String[] tiles2 = value.split(":"); - genotypesParsed[i] = tiles2[index]; + String genotype = tiles2[index]; + + float dosage = 0; + // genotypes + if (genotype.equals("0|0")) { + dosage = 0; + } else if (genotype.equals("0|1") || genotype.equals("1|0")) { + dosage = 1; + } else if (genotype.equals("1|1")) { + dosage = 2; + } else { + // dosage + dosage = Float.parseFloat(genotype); + } + genotypesParsed[i] = dosage; + } return genotypesParsed; } - + } diff --git a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java index 1fd309b..fd80e6c 100644 --- a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java +++ b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java @@ -253,21 +253,10 @@ private void processVCF(String vcfFilename, String... riskScoreFilenames) throws variantFile.next(); } - String[] values = variant.getGenotypes(genotypeFormat); + float[] dosages = variant.getGenotypeDosages(genotypeFormat); for (int i = 0; i < countSamples; i++) { - float dosage = 0; - // genotypes - if (values[i].equals("0|0")) { - dosage = 0; - } else if (values[i].equals("0|1") || values[i].equals("1|0")) { - dosage = 1; - } else if (values[i].equals("1|1")) { - dosage = 2; - } else { - // dosage - dosage = Float.parseFloat(values[i]); - } + float dosage = dosages[i]; float score = riskScores[i].getScore(j) + (dosage * effectWeight); riskScores[i].setScore(j, score); } From 9683f80b1134301a92e12ae97b5352f42736575f Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Mon, 15 Jun 2020 17:38:02 +0200 Subject: [PATCH 3/9] Add multiple score files to command --- .../riskscore/commands/ApplyScoreCommand.java | 41 +++++++++----- .../java/genepi/riskscore/io/OutputFile.java | 8 ++- .../riskscore/tasks/ApplyScoreTask.java | 27 ++++++++-- .../commands/ApplyScoreCommandTest.java | 53 ++++++++++++++++--- 4 files changed, 100 insertions(+), 29 deletions(-) diff --git a/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java b/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java index 1bdd7b7..b9b20f8 100644 --- a/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java +++ b/src/main/java/genepi/riskscore/commands/ApplyScoreCommand.java @@ -81,23 +81,30 @@ public Integer call() throws Exception { System.out.println(); ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFilenames(ref); + + String[] refs = parseRef(ref); + task.setRiskScoreFilenames(refs); + if (format != null) { + RiskScoreFormat riskScoreFormat = RiskScoreFormat.load(format); + for (String file : refs) { + task.setRiskScoreFormat(file, riskScoreFormat); + } + } else { + for (String file : refs) { + String autoFormat = file + ".format"; + if (new File(autoFormat).exists()) { + RiskScoreFormat riskScoreFormat = RiskScoreFormat.load(autoFormat); + task.setRiskScoreFormat(file, riskScoreFormat); + } + } + } + if (chunk != null) { task.setChunk(chunk); } task.setVcfFilenames(vcfs); task.setMinR2(minR2); task.setGenotypeFormat(genotypeFormat); - if (format != null) { - RiskScoreFormat riskScoreFormat = RiskScoreFormat.load(format); - task.setRiskScoreFormat(riskScoreFormat); - } else { - String autoFormat = ref + ".format"; - if (new File(autoFormat).exists()) { - RiskScoreFormat riskScoreFormat = RiskScoreFormat.load(autoFormat); - task.setRiskScoreFormat(riskScoreFormat); - } - } task.setOutputVariantFilename(outputVariantFilename); task.setIncludeVariantFilename(includeVariantFilename); @@ -120,8 +127,8 @@ public Integer call() throws Exception { System.out.println(" - Samples: " + number(task.getCountSamples())); System.out.println(" - Variants: " + number(task.getCountVariants())); System.out.println(); - - for (RiskScoreSummary summary: task.getSummaries()) { + + for (RiskScoreSummary summary : task.getSummaries()) { System.out.println(summary); System.out.println(); } @@ -134,6 +141,14 @@ public Integer call() throws Exception { } + private String[] parseRef(String ref) { + String[] refs = ref.split(","); + for (int i = 0; i < refs.length; i++) { + refs[i] = refs[i].trim(); + } + return refs; + } + public static String number(long number) { DecimalFormat formatter = new DecimalFormat("###,###,###"); return formatter.format(number); diff --git a/src/main/java/genepi/riskscore/io/OutputFile.java b/src/main/java/genepi/riskscore/io/OutputFile.java index 932bcbe..3545503 100644 --- a/src/main/java/genepi/riskscore/io/OutputFile.java +++ b/src/main/java/genepi/riskscore/io/OutputFile.java @@ -31,19 +31,17 @@ public OutputFile() { public OutputFile(RiskScore[] finalScores, RiskScoreSummary[] summaries) { - this.scores = new Vector(); + scores = new Vector(); for (RiskScoreSummary summary : summaries) { scores.add(summary.getName()); } samples = new Vector(); - data = new Vector[1 + scores.size()]; + data = new Vector[scores.size()]; for (int i = 0; i < scores.size(); i++) { data[i] = new Vector(); } - scores = new Vector(); - scores.add(COLUMN_SCORE); - this.scores = scores; + for (RiskScore riskScore : finalScores) { samples.add(riskScore.getSample()); for (int i = 0; i < scores.size(); i++) { diff --git a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java index fd80e6c..642c367 100644 --- a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java +++ b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java @@ -1,7 +1,9 @@ package genepi.riskscore.tasks; import java.io.IOException; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Vector; import genepi.io.table.writer.CsvTableWriter; @@ -38,7 +40,9 @@ public class ApplyScoreTask { private CsvTableWriter variantFile; - private RiskScoreFormat format = new PGSCatalogFormat(); + private RiskScoreFormat defaultFormat = new PGSCatalogFormat(); + + private Map formats = new HashMap(); private String genotypeFormat = DOSAGE_FORMAT; @@ -52,6 +56,9 @@ public class ApplyScoreTask { public void setRiskScoreFilenames(String... filenames) { this.riskScoreFilenames = filenames; + for (String filename : filenames) { + formats.put(filename, defaultFormat); + } } public void setChunk(Chunk chunk) { @@ -144,6 +151,7 @@ private void processVCF(String vcfFilename, String... riskScoreFilenames) throws } RiskScoreFile[] riskscores = new RiskScoreFile[numberRiskScores]; for (int i = 0; i < numberRiskScores; i++) { + RiskScoreFormat format = formats.get(riskScoreFilenames[i]); RiskScoreFile riskscore = new RiskScoreFile(riskScoreFilenames[i], format); System.out.println("Loading file " + riskScoreFilenames[i] + "..."); @@ -156,11 +164,11 @@ private void processVCF(String vcfFilename, String... riskScoreFilenames) throws summaries[i].setVariants(riskscore.getTotalVariants()); System.out.println("Loaded " + riskscore.getCacheSize() + " weights for chromosome " + chromosome); - - System.out.println("Loading file " + vcfFilename + "..."); riskscores[i] = riskscore; } + System.out.println("Loading file " + vcfFilename + "..."); + vcfReader = new FastVCFFileReader(vcfFilename); countSamples = vcfReader.getGenotypedSamples().size(); @@ -276,8 +284,17 @@ public void setMinR2(float minR2) { this.minR2 = minR2; } - public void setRiskScoreFormat(RiskScoreFormat format) { - this.format = format; + public void setRiskScoreFormat(RiskScoreFormat defaultFormat) { + this.defaultFormat = defaultFormat; + if (riskScoreFilenames != null) { + for (String file : riskScoreFilenames) { + setRiskScoreFormat(file, defaultFormat); + } + } + } + + public void setRiskScoreFormat(String file, RiskScoreFormat format) { + this.formats.put(file, format); } public int getCountSamples() { diff --git a/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java b/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java index f2fb925..e87f88e 100644 --- a/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java +++ b/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java @@ -28,7 +28,46 @@ public void testCall() { } assertEquals(EXPECTED_SAMPLES, samples); + reader.close(); + } + + @Test + public void testCallWithMultipleScores() { + + String[] args = { "test-data/test.chr1.vcf", "test-data/test.chr2.vcf", "--ref", + "test-data/test.scores.csv,test-data/test.scores.csv,test-data/test.scores.csv", "--out", + "output.csv" }; + int result = new CommandLine(new ApplyScoreCommand()).execute(args); + assertEquals(0, result); + int samples = 0; + ITableReader reader = new CsvTableReader("output.csv", ','); + while (reader.next()) { + samples++; + + } + assertEquals(4, reader.getColumns().length); + assertEquals(2, samples); + reader.close(); + + reader = new CsvTableReader("output.csv", ','); + + assertEquals(true, reader.next()); + + double score = reader.getDouble("score"); + String sample = reader.getString("sample"); + assertEquals("LF001", sample); + assertEquals(-(1 + 3), score, 0.0000001); + + assertEquals(true, reader.next()); + + score = reader.getDouble("score"); + sample = reader.getString("sample"); + assertEquals("LF002", sample); + assertEquals(-(3 + 7), score, 0.0000001); + + assertEquals(false, reader.next()); + reader.close(); } @Test @@ -122,11 +161,12 @@ public void testCallWithMissingVcf() { assertEquals(1, result); } - + @Test public void testCallWithChunk() { - String[] args = { "test-data/chr20.dose.vcf.gz", "--ref", "test-data/chr20.scores.csv", "--out", "output.csv", "--start", "61795", "--end", "63231" }; + String[] args = { "test-data/chr20.dose.vcf.gz", "--ref", "test-data/chr20.scores.csv", "--out", "output.csv", + "--start", "61795", "--end", "63231" }; int result = new CommandLine(new ApplyScoreCommand()).execute(args); assertEquals(0, result); @@ -137,16 +177,17 @@ public void testCallWithChunk() { } assertEquals(EXPECTED_SAMPLES, samples); - + reader.close(); } - + @Test public void testCallWithStartOnly() { - String[] args = { "test-data/chr20.dose.vcf.gz", "--ref", "test-data/chr20.scores.csv", "--out", "output.csv", "--start", "61795"}; + String[] args = { "test-data/chr20.dose.vcf.gz", "--ref", "test-data/chr20.scores.csv", "--out", "output.csv", + "--start", "61795" }; int result = new CommandLine(new ApplyScoreCommand()).execute(args); assertEquals(2, result); - + } } From 5e237fe0febf469660d513549282051e1de56037 Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Mon, 15 Jun 2020 17:39:28 +0200 Subject: [PATCH 4/9] Refactore default risk score format --- .../riskscore/tasks/ApplyScoreTask.java | 2 +- .../riskscore/tasks/ApplyScoreTaskTest.java | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java index 642c367..7ee9f5e 100644 --- a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java +++ b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java @@ -284,7 +284,7 @@ public void setMinR2(float minR2) { this.minR2 = minR2; } - public void setRiskScoreFormat(RiskScoreFormat defaultFormat) { + public void setDefaultRiskScoreFormat(RiskScoreFormat defaultFormat) { this.defaultFormat = defaultFormat; if (riskScoreFilenames != null) { for (String file : riskScoreFilenames) { diff --git a/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java b/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java index 3af3c24..f3c2c74 100644 --- a/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java +++ b/src/test/java/genepi/riskscore/tasks/ApplyScoreTaskTest.java @@ -18,7 +18,7 @@ public class ApplyScoreTaskTest { public void testPerformance() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/chr20.dose.vcf.gz"); task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.run(); @@ -39,7 +39,7 @@ public void testPerformance() throws Exception { public void testMultiPostion() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/small.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.run(); @@ -60,7 +60,7 @@ public void testMultiPostion() throws Exception { public void testMultipleScores() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/chr20.dose.vcf.gz"); task.setRiskScoreFilenames("test-data/chr20.scores.csv", "test-data/chr20.scores.csv", "test-data/chr20.scores.csv"); @@ -84,7 +84,7 @@ public void testMultipleScores() throws Exception { public void testScore() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/single.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.run(); @@ -108,7 +108,7 @@ public void testScore() throws Exception { public void testScoreSwitchEffectAllele() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/single.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.run(); @@ -131,7 +131,7 @@ public void testScoreSwitchEffectAllele() throws Exception { public void testMinR2_06() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/two.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.csv"); task.setMinR2(0.6f); @@ -157,7 +157,7 @@ public void testMinR2_06() throws Exception { public void testMinR2_05() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/two.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.setMinR2(0.5f); @@ -183,7 +183,7 @@ public void testMinR2_05() throws Exception { public void testMinR2_1() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/two.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.setMinR2(1f); @@ -209,7 +209,7 @@ public void testMinR2_1() throws Exception { public void testMultipleFiles() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.vcf"); task.setRiskScoreFilenames("test-data/test.scores.csv"); task.run(); @@ -242,7 +242,7 @@ public void testMultipleFiles() throws Exception { public void testWriteVariantFile() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.vcf"); task.setRiskScoreFilenames("test-data/test.scores.csv"); task.setOutputVariantFilename("variants.txt"); @@ -269,7 +269,7 @@ public void testWriteVariantFile() throws Exception { public void testReadVariantsFile() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.vcf"); task.setRiskScoreFilenames("test-data/test.scores.csv"); task.setIncludeVariantFilename("test-data/variants.txt"); @@ -293,7 +293,7 @@ public void testReadVariantsFile() throws Exception { public void testWrongChromosome() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/single.wrong_chr.vcf"); task.setRiskScoreFilenames("test-data/chr20.scores.2.csv"); task.setMinR2(1f); @@ -305,7 +305,7 @@ public void testWrongChromosome() throws Exception { public void testDifferentSamples() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/test.chr1.vcf", "test-data/test.chr2.wrong.vcf"); task.setRiskScoreFilenames("test-data/test.scores.csv"); task.setMinR2(1f); @@ -317,7 +317,7 @@ public void testDifferentSamples() throws Exception { public void testWithChunk() throws Exception { ApplyScoreTask task = new ApplyScoreTask(); - task.setRiskScoreFormat(new RiskScoreFormat()); + task.setDefaultRiskScoreFormat(new RiskScoreFormat()); task.setVcfFilenames("test-data/chr20.dose.vcf.gz"); task.setRiskScoreFilenames("test-data/chr20.scores.csv"); Chunk chunk = new Chunk(); From 02fb3c874db5725ac2ee1b4f711fe78ccd4ca263 Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Mon, 15 Jun 2020 17:42:08 +0200 Subject: [PATCH 5/9] Add multiple score files to README.md --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index e19b85c..5b425a1 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,14 @@ Apply PRS to multiple files by using file patterns: ./pgs-calc --ref PGS000018.txt.gz test.chr*.vcf.gz --out scores.txt ``` +#### Multiple scores + +Apply multiple score files: + +``` +./pgs-calc --ref PGS000018.txt.gz,PGS000027.txt.gz test.chr*.vcf.gz --out scores.txt +``` + #### Filter by Imputation Qualitity From a7370e505548010ca14a28fcc0be85cc7a33bcef Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Mon, 15 Jun 2020 17:44:15 +0200 Subject: [PATCH 6/9] Add missing Exception to testcase --- .../java/genepi/riskscore/commands/ApplyScoreCommandTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java b/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java index e87f88e..d17009a 100644 --- a/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java +++ b/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java @@ -32,7 +32,7 @@ public void testCall() { } @Test - public void testCallWithMultipleScores() { + public void testCallWithMultipleScores() throws IOException { String[] args = { "test-data/test.chr1.vcf", "test-data/test.chr2.vcf", "--ref", "test-data/test.scores.csv,test-data/test.scores.csv,test-data/test.scores.csv", "--out", From e3995c3d868c4c6e8377186d40edf252cb7b84bd Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Thu, 18 Jun 2020 12:39:21 +0200 Subject: [PATCH 7/9] Improve performance using lazy loading --- .../io/vcf/MinimalVariantContext.java | 69 ++++++++++--------- 1 file changed, 38 insertions(+), 31 deletions(-) diff --git a/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java b/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java index f9ef44d..5921083 100644 --- a/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java +++ b/src/main/java/genepi/riskscore/io/vcf/MinimalVariantContext.java @@ -42,6 +42,8 @@ public class MinimalVariantContext { private Map infos; + private boolean dirtyGenotypes = true; + public MinimalVariantContext(int samples) { genotypes = new boolean[samples]; genotypesParsed = new float[samples]; @@ -133,6 +135,7 @@ public int getNSamples() { public void setRawLine(String rawLine) { this.rawLine = rawLine; this.id = null; + this.dirtyGenotypes = true; } public String getRawLine() { @@ -230,40 +233,44 @@ public double getInfoAsDouble(String key, double defaultValue) { } public float[] getGenotypeDosages(String field) throws IOException { - - //TODO: implement dirty flag, and parse genotypes only once - - String tiles[] = rawLine.split("\t", 10); - String[] formats = tiles[8].split(":"); - int index = -1; - for (int i = 0; i < formats.length; i++) { - if (formats[i].equals(field)) { - index = i; + + if (dirtyGenotypes) { + + String tiles[] = rawLine.split("\t", 10); + String[] formats = tiles[8].split(":"); + int index = -1; + for (int i = 0; i < formats.length; i++) { + if (formats[i].equals(field)) { + index = i; + } } - } - if (index == -1) { - throw new IOException("field '" + field + "' not found in FORMAT"); - } - String[] values = tiles[9].split("\t"); - for (int i = 0; i < genotypesParsed.length; i++) { - String value = values[i]; - String[] tiles2 = value.split(":"); - String genotype = tiles2[index]; - - float dosage = 0; - // genotypes - if (genotype.equals("0|0")) { - dosage = 0; - } else if (genotype.equals("0|1") || genotype.equals("1|0")) { - dosage = 1; - } else if (genotype.equals("1|1")) { - dosage = 2; - } else { - // dosage - dosage = Float.parseFloat(genotype); + if (index == -1) { + throw new IOException("field '" + field + "' not found in FORMAT"); + } + String[] values = tiles[9].split("\t"); + for (int i = 0; i < genotypesParsed.length; i++) { + String value = values[i]; + String[] tiles2 = value.split(":"); + String genotype = tiles2[index]; + + float dosage = 0; + // genotypes + if (genotype.equals("0|0")) { + dosage = 0; + } else if (genotype.equals("0|1") || genotype.equals("1|0")) { + dosage = 1; + } else if (genotype.equals("1|1")) { + dosage = 2; + } else { + // dosage + dosage = Float.parseFloat(genotype); + } + genotypesParsed[i] = dosage; + } - genotypesParsed[i] = dosage; + + dirtyGenotypes = false; } return genotypesParsed; From 93aaccb58f5b297d6387db903d5fd9f06b4e344b Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Thu, 18 Jun 2020 13:03:39 +0200 Subject: [PATCH 8/9] Support IDs from PGSCatalog and download files automatically --- .../java/genepi/riskscore/io/PGSCatalog.java | 45 +++++++++++++++++++ .../genepi/riskscore/io/RiskScoreFile.java | 17 +++++-- .../riskscore/tasks/ApplyScoreTask.java | 4 +- .../commands/ApplyScoreCommandTest.java | 19 ++++++++ 4 files changed, 80 insertions(+), 5 deletions(-) create mode 100644 src/main/java/genepi/riskscore/io/PGSCatalog.java diff --git a/src/main/java/genepi/riskscore/io/PGSCatalog.java b/src/main/java/genepi/riskscore/io/PGSCatalog.java new file mode 100644 index 0000000..e8812de --- /dev/null +++ b/src/main/java/genepi/riskscore/io/PGSCatalog.java @@ -0,0 +1,45 @@ +package genepi.riskscore.io; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.nio.file.StandardCopyOption; +import java.text.MessageFormat; + +import genepi.io.FileUtil; + +public class PGSCatalog { + + public static String USER_HOME = System.getProperty("user.home"); + + public static String CACHE_DIR = FileUtil.path(USER_HOME, ".pgs-calc", "pgs-catalog"); + + public static String FILE_URL = "http://ftp.ebi.ac.uk/pub/databases/spot/pgs/scores/{0}/ScoringFiles/{0}.txt.gz"; + + public static String getFilenameById(String id) throws IOException { + + String filename = FileUtil.path(CACHE_DIR, id + ".txt.gz"); + + if ((new File(filename)).exists()) { + System.out.println("Score '" + id + "' found in local cache " + filename); + return filename; + } + + FileUtil.createDirectory(CACHE_DIR); + + MessageFormat format = new MessageFormat(FILE_URL); + String url = format.format(new Object[] { id }); + + System.out.println("Downloading score '" + id + "' from " + url + "..."); + + InputStream in = new URL(url).openStream(); + Files.copy(in, Paths.get(filename), StandardCopyOption.REPLACE_EXISTING); + + return filename; + + } + +} diff --git a/src/main/java/genepi/riskscore/io/RiskScoreFile.java b/src/main/java/genepi/riskscore/io/RiskScoreFile.java index aaae848..eba46d9 100644 --- a/src/main/java/genepi/riskscore/io/RiskScoreFile.java +++ b/src/main/java/genepi/riskscore/io/RiskScoreFile.java @@ -36,13 +36,22 @@ public RiskScoreFile(String filename, RiskScoreFormat format) throws Exception { variants = new HashMap(); if (!new File(filename).exists()) { - throw new Exception("File '" + filename + "' not found."); + + // check if its a PGS id + if (filename.startsWith("PGS") && filename.length() == 9 && !filename.endsWith(".txt.gz")) { + String id = filename; + this.filename = PGSCatalog.getFilenameById(id); + } else { + + throw new Exception("File '" + filename + "' not found."); + + } } - DataInputStream in = openTxtOrGzipStream(filename); + DataInputStream in = openTxtOrGzipStream(this.filename); ITableReader reader = new CsvTableReader(in, RiskScoreFormat.SEPARATOR); - checkFileFormat(reader, filename); + checkFileFormat(reader, this.filename); reader.close(); } @@ -70,7 +79,7 @@ private void checkFileFormat(ITableReader reader, String filename) throws Except } public void buildIndex(String chromosome) throws IOException { - buildIndex(chromosome, new Chunk()); + buildIndex(chromosome, new Chunk()); } public void buildIndex(String chromosome, Chunk chunk) throws IOException { diff --git a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java index 7ee9f5e..fabfb03 100644 --- a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java +++ b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java @@ -151,9 +151,11 @@ private void processVCF(String vcfFilename, String... riskScoreFilenames) throws } RiskScoreFile[] riskscores = new RiskScoreFile[numberRiskScores]; for (int i = 0; i < numberRiskScores; i++) { + + System.out.println("Loading file " + riskScoreFilenames[i] + "..."); + RiskScoreFormat format = formats.get(riskScoreFilenames[i]); RiskScoreFile riskscore = new RiskScoreFile(riskScoreFilenames[i], format); - System.out.println("Loading file " + riskScoreFilenames[i] + "..."); if (chunk != null) { riskscore.buildIndex(chromosome, chunk); diff --git a/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java b/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java index d17009a..2541b56 100644 --- a/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java +++ b/src/test/java/genepi/riskscore/commands/ApplyScoreCommandTest.java @@ -30,6 +30,25 @@ public void testCall() { assertEquals(EXPECTED_SAMPLES, samples); reader.close(); } + + + @Test + public void testCallWithPGSID() { + + String[] args = { "test-data/chr20.dose.vcf.gz", "--ref", "PGS000028", "--out", "output.csv" }; + int result = new CommandLine(new ApplyScoreCommand()).execute(args); + assertEquals(0, result); + + int samples = 0; + ITableReader reader = new CsvTableReader("output.csv", ','); + while (reader.next()) { + samples++; + + } + assertEquals(EXPECTED_SAMPLES, samples); + reader.close(); + } + @Test public void testCallWithMultipleScores() throws IOException { From fa1b36ec1bf8d9854aac98f4ec6bd13136558397 Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Thu, 18 Jun 2020 13:07:00 +0200 Subject: [PATCH 9/9] Prepare release 0.8.3 --- pom.xml | 4 ++-- src/main/java/genepi/riskscore/App.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index b798877..d8aad80 100644 --- a/pom.xml +++ b/pom.xml @@ -3,9 +3,9 @@ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 4.0.0 - genepi + lukfor pgs-calc - 0.8.2 + 0.8.3 jar riskscore diff --git a/src/main/java/genepi/riskscore/App.java b/src/main/java/genepi/riskscore/App.java index 59a0516..379e252 100644 --- a/src/main/java/genepi/riskscore/App.java +++ b/src/main/java/genepi/riskscore/App.java @@ -8,7 +8,7 @@ public class App { public static final String APP = "pgs-calc"; - public static final String VERSION = "0.8.2"; + public static final String VERSION = "0.8.3"; public static final String URL = "https://github.com/lukfor/pgs-calc";