From ad98b9b7c676d16172799ad0cd5ee926a3d3d3d0 Mon Sep 17 00:00:00 2001 From: Lukas Forer Date: Sun, 3 Dec 2023 14:44:58 +0100 Subject: [PATCH] Add bgzip support to collections and info files --- .../commands/CreateCollectionCommand.java | 165 +++++++++++++++--- .../commands/VariantReadingException.java | 7 + .../genepi/riskscore/io/RiskScoreFile.java | 3 + .../io/csv/CsvWithHeaderTableWriter.java | 12 ++ .../io/scores/IRiskScoreCollection.java | 1 + .../io/scores/MergedRiskScoreCollection.java | 39 +++-- .../io/scores/RiskScoreCollection.java | 7 + .../riskscore/tasks/ApplyScoreTask.java | 3 + 8 files changed, 198 insertions(+), 39 deletions(-) create mode 100644 src/main/java/genepi/riskscore/commands/VariantReadingException.java diff --git a/src/main/java/genepi/riskscore/commands/CreateCollectionCommand.java b/src/main/java/genepi/riskscore/commands/CreateCollectionCommand.java index 5f3dee0..c8ed5db 100644 --- a/src/main/java/genepi/riskscore/commands/CreateCollectionCommand.java +++ b/src/main/java/genepi/riskscore/commands/CreateCollectionCommand.java @@ -1,5 +1,7 @@ package genepi.riskscore.commands; +import java.io.File; +import java.io.OutputStreamWriter; import java.util.*; import java.util.concurrent.Callable; @@ -13,6 +15,7 @@ import genepi.riskscore.io.formats.PGSCatalogHarmonizedFormat; import genepi.riskscore.io.formats.RiskScoreFormatImpl; import genepi.riskscore.io.scores.MergedRiskScoreCollection; +import htsjdk.samtools.util.BlockCompressedOutputStream; import picocli.CommandLine.Command; import picocli.CommandLine.Option; import picocli.CommandLine.Parameters; @@ -24,7 +27,7 @@ public class CreateCollectionCommand implements Callable { private String output = null; @Parameters(description = "score files") - private String[] filenames; + private String[] files; public static String[] chromosomeOrder = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "XY"}; @@ -39,17 +42,29 @@ public class CreateCollectionCommand implements Callable { @Override public Integer call() throws Exception { + PGSCatalogHarmonizedFormat format = new PGSCatalogHarmonizedFormat(); + + List validFilenames = new Vector(files.length); + //validate files + for (String filename: files){ + if (checkFileFormat(filename, format)){ + validFilenames.add(filename); + } + } + + String[] filenames = new String[validFilenames.size()]; + filenames = validFilenames.toArray(filenames); String[] names = new String[filenames.length]; + int[] totalVariants = new int[filenames.length]; + int[] ignoredVariants = new int[filenames.length]; CsvWithHeaderTableReader[] readers = new CsvWithHeaderTableReader[filenames.length]; - RiskScoreFormatImpl[] formats = new RiskScoreFormatImpl[filenames.length]; Variant[] variants = new Variant[filenames.length]; for (int i = 0; i < filenames.length; i++) { names[i] = RiskScoreFile.getName(filenames[i]); - formats[i] = new PGSCatalogHarmonizedFormat(); - readers[i] = new CsvWithHeaderTableReader(filenames[i], formats[i].getSeparator()); + readers[i] = new CsvWithHeaderTableReader(filenames[i], format.getSeparator()); try { - variants[i] = readVariant(readers[i], formats[i]); + variants[i] = readVariant(readers[i], format); } catch (Exception e) { throw new RuntimeException("File " + filenames[i], e); } @@ -57,12 +72,13 @@ public Integer call() throws Exception { List header = new Vector(); header.add(MergedRiskScoreCollection.HEADER); - header.add("#Date=" + new Date()); - header.add("#Scores=" + filenames.length); + header.add("# Date=" + new Date()); + header.add("# Scores=" + filenames.length); + CsvWithHeaderTableWriter writer = null; if (output != null) { - writer = new CsvWithHeaderTableWriter(output, '\t', header); + writer = new CsvWithHeaderTableWriter(new OutputStreamWriter(new BlockCompressedOutputStream(new File(output))), '\t', header); } else { writer = new CsvWithHeaderTableWriter('\t', header); } @@ -80,16 +96,24 @@ public Integer call() throws Exception { addVariant(writer, variant); for (int i = 0; i < variants.length; i++) { if (variants[i] != null && variants[i].matches(variant)) { - writeVariant(writer, names[i], variants[i].getNormalizedEffect(variant)); + variants[i].align(variant); + writeVariant(writer, names[i], variants[i]); Variant nextVariant = null; - try { - nextVariant = readVariant(readers[i], formats[i]); - } catch (Exception e) { - throw new RuntimeException("File " + filenames[i], e); + boolean read = true; + while(read) { + try { + nextVariant = readVariant(readers[i], format); + read = false; + }catch (VariantReadingException e){ + ignoredVariants[i]++; + } catch (Exception e) { + throw new RuntimeException("File " + filenames[i], e); + } } if (nextVariant != null && nextVariant.isBefore(variants[i])) { throw new RuntimeException(filenames[i] + ": Not sorted. " + nextVariant + " is before " + variants[i]); } + totalVariants[i]++; variants[i] = nextVariant; } else { writeMissing(writer, names[i]); @@ -106,6 +130,18 @@ public Integer call() throws Exception { reader.close(); } + if (output != null) { + CsvWithHeaderTableWriter writerMeta = new CsvWithHeaderTableWriter(output + MergedRiskScoreCollection.META_EXTENSION, '\t', header); + writerMeta.setColumns(new String[]{"score","variants","ignored"}); + for (int i = 0; i < names.length; i++){ + writerMeta.setString("score",names[i]); + writerMeta.setInteger("variants",totalVariants[i]); + writerMeta.setInteger("ignored",ignoredVariants[i]); + writerMeta.next(); + } + writerMeta.close(); + } + System.err.println("Wrote " + variantsWritten + " unique variants and " + filenames.length + " scores."); return 0; @@ -117,7 +153,7 @@ public void setOutput(String output) { } public void setFilenames(String[] filenames) { - this.filenames = filenames; + this.files = filenames; } @@ -156,19 +192,57 @@ private String[] merge(String[] first, String[] second) { return result; } - public Variant readVariant(ITableReader reader, RiskScoreFormatImpl format) { + public Variant readVariant(ITableReader reader, RiskScoreFormatImpl format) throws VariantReadingException { + int row = 0; + if (!reader.next()) { return null; } - Variant variant = new Variant(); - variant.setChromosome(reader.getString(format.getChromosome())); + + String chromosome = reader.getString(format.getChromosome()); + if (!chromosomeOrderIndex.containsKey(chromosome)){ + throw new VariantReadingException("Row " + row + ": Chromosome is invalid."); + } + if (reader.getString(format.getPosition()).isEmpty()) { - throw new RuntimeException("Not position found."); + throw new VariantReadingException("Row " + row + ": Position is empty. Ignore variant."); + } + + int position = 0; + try { + position = reader.getInteger(format.getPosition()); + + } catch (NumberFormatException e) { + throw new VariantReadingException("Row " + row + ": '" + reader.getString(format.getPosition()) + + "' is an invalid position. Ignore variant."); + } + + float effectWeight = 0; + try { + effectWeight = ((Double) (reader.getDouble(format.getEffectWeight()))).floatValue(); + } catch (NumberFormatException e) { + throw new VariantReadingException("Row " + row + ": '" + reader.getString(format.getEffectWeight()) + + "' is an invalid weight. Ignore variant."); } - variant.setPosition(reader.getInteger(format.getPosition())); - variant.setEffectAllele(reader.getString(format.getEffectAllele())); - variant.setOtherAllele(reader.getString(format.getOtherAllele())); - variant.setEffect(reader.getDouble(format.getEffectWeight())); + + String rawOtherA = reader.getString(format.getOtherAllele()); + if (rawOtherA.isEmpty()) { + throw new VariantReadingException("Row " + row + ": Other allele is empty. Ignore variant."); + } + String otherAllele = rawOtherA.trim(); + + String rawEffectAllele = reader.getString(format.getEffectAllele()); + if (rawEffectAllele.isEmpty()) { + throw new VariantReadingException("Row " + row + ": Effect allele is empty. Ignore variant."); + } + String effectAllele = rawEffectAllele.trim(); + + Variant variant = new Variant(); + variant.setChromosome(chromosome); + variant.setPosition(position); + variant.setEffectAllele(effectAllele); + variant.setOtherAllele(otherAllele); + variant.setEffect(effectWeight); return variant; } @@ -180,8 +254,8 @@ public void addVariant(ITableWriter writer, Variant variant) { writer.setString(MergedRiskScoreCollection.COLUMN_OTHER_ALLELE, variant.getOtherAllele()); } - public void writeVariant(ITableWriter writer, String score, double effect) { - writer.setDouble(score, effect); + public void writeVariant(ITableWriter writer, String score, Variant variant) { + writer.setDouble(score, variant.getEffect()); } public void writeMissing(ITableWriter writer, String score) { @@ -235,18 +309,26 @@ public void setOtherAllele(String otherAllele) { this.otherAllele = otherAllele; } - public double getNormalizedEffect(Variant variant) { + public void align(Variant variant) { if (this.hasSameAlleles(variant)) { - return effect; + return; } if (this.hasSwappedAlleles(variant)) { - return -effect; + swapAlleles(); + return; } throw new RuntimeException("Error. Wrong alleles!!"); } + public void swapAlleles(){ + String _otherAllele = otherAllele; + otherAllele = effectAllele; + effectAllele = _otherAllele; + effect = -effect; + } + private boolean hasSameAlleles(Variant variant) { return this.effectAllele.equals(variant.effectAllele) && this.otherAllele.equals(variant.otherAllele); } @@ -295,5 +377,34 @@ public boolean matches(Variant variant) { } } + + private boolean checkFileFormat(String filename, RiskScoreFormatImpl format) throws Exception { + CsvWithHeaderTableReader reader = new CsvWithHeaderTableReader(filename, format.getSeparator()); + reader.close(); + if (!reader.hasColumn(format.getChromosome())) { + System.out.println("Column '" + format.getChromosome() + "' not found in '" + filename + "'. Ignore."); + return false; + } + if (!reader.hasColumn(format.getPosition())) { + System.out.println("Column '" + format.getPosition() + "' not found in '" + filename + "'. Ignore."); + return false; + } + if (!reader.hasColumn(format.getEffectWeight())) { + System.out.println("Column '" + format.getEffectWeight() + "' not found in '" + filename + "'. Ignore."); + return false; + } + if (!reader.hasColumn(format.getOtherAllele())) { + System.out.println("Column '" + format.getOtherAllele() + "' not found in '" + filename + "'. Ignore."); + return false; + } + if (!reader.hasColumn(format.getEffectAllele())) { + System.out.println("Column '" + format.getEffectAllele() + "' not found in '" + filename + "'. Ignore."); + return false; + } + + return true; + + } + } diff --git a/src/main/java/genepi/riskscore/commands/VariantReadingException.java b/src/main/java/genepi/riskscore/commands/VariantReadingException.java new file mode 100644 index 0000000..0685f93 --- /dev/null +++ b/src/main/java/genepi/riskscore/commands/VariantReadingException.java @@ -0,0 +1,7 @@ +package genepi.riskscore.commands; + +public class VariantReadingException extends Exception { + public VariantReadingException(String message) { + super(message); + } +} diff --git a/src/main/java/genepi/riskscore/io/RiskScoreFile.java b/src/main/java/genepi/riskscore/io/RiskScoreFile.java index 94c53df..1ab85d6 100644 --- a/src/main/java/genepi/riskscore/io/RiskScoreFile.java +++ b/src/main/java/genepi/riskscore/io/RiskScoreFile.java @@ -277,4 +277,7 @@ public void warning(String text) { } } + public void clearIndex() { + variants = null; + } } diff --git a/src/main/java/genepi/riskscore/io/csv/CsvWithHeaderTableWriter.java b/src/main/java/genepi/riskscore/io/csv/CsvWithHeaderTableWriter.java index a13bd2c..0b7c371 100644 --- a/src/main/java/genepi/riskscore/io/csv/CsvWithHeaderTableWriter.java +++ b/src/main/java/genepi/riskscore/io/csv/CsvWithHeaderTableWriter.java @@ -3,6 +3,7 @@ import java.io.FileWriter; import java.io.IOException; import java.io.OutputStreamWriter; +import java.io.Writer; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -46,6 +47,17 @@ public CsvWithHeaderTableWriter( char separator, List header) { CSVWriter.NO_ESCAPE_CHARACTER); } + public CsvWithHeaderTableWriter(Writer stream, char separator, List header) throws IOException { + for (String line : header) { + stream.write(line.replace("\n", "").replace("\r", "")); + stream.write(System.lineSeparator()); + } + stream.write("# Created by " + App.APP + " " + App.VERSION); + stream.write(System.lineSeparator()); + writer = new CSVWriter(stream, separator, CSVWriter.NO_QUOTE_CHARACTER, + CSVWriter.NO_ESCAPE_CHARACTER); + } + @Override public void close() { try { diff --git a/src/main/java/genepi/riskscore/io/scores/IRiskScoreCollection.java b/src/main/java/genepi/riskscore/io/scores/IRiskScoreCollection.java index e8b4dc5..91eb4b8 100644 --- a/src/main/java/genepi/riskscore/io/scores/IRiskScoreCollection.java +++ b/src/main/java/genepi/riskscore/io/scores/IRiskScoreCollection.java @@ -32,4 +32,5 @@ public interface IRiskScoreCollection { public RiskScoreSummary[] getSummaries(); + public void clearIndex(); } diff --git a/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java b/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java index 16bcca0..d98646c 100644 --- a/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java +++ b/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java @@ -9,10 +9,8 @@ import genepi.riskscore.model.RiskScoreSummary; import jxl.format.PaperSize; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; +import java.io.File; +import java.util.*; public class MergedRiskScoreCollection implements IRiskScoreCollection { @@ -34,6 +32,8 @@ public class MergedRiskScoreCollection implements IRiskScoreCollection { public static String HEADER = "# PGS-Collection v1"; + public static String META_EXTENSION = ".info"; + public static String COLUMN_CHROMOSOME = "chr_name"; public static String COLUMN_POSITION = "chr_position"; @@ -73,6 +73,22 @@ public String getVersion() { @Override public void buildIndex(String chromosome, Chunk chunk, String dbsnp, String proxies) throws Exception { + String metaFilename = filename + META_EXTENSION; + File metaFile = new File(metaFilename); + if (!metaFile.exists()){ + throw new RuntimeException("No meta file '" + metaFilename + "' found."); + } + + Map metaIndex = new HashMap(); + Map metaIndex2 = new HashMap(); + CsvWithHeaderTableReader readerMeta = new CsvWithHeaderTableReader(metaFilename, '\t'); + while(readerMeta.next()){ + metaIndex.put(readerMeta.getString("score"), readerMeta.getInteger("variants")); + metaIndex2.put(readerMeta.getString("score"), readerMeta.getInteger("ignored")); + } + readerMeta.close(); + + CsvWithHeaderTableReader reader = new CsvWithHeaderTableReader(filename, '\t'); String[] columns = reader.getColumns(); @@ -85,6 +101,8 @@ public void buildIndex(String chromosome, Chunk chunk, String dbsnp, String prox continue; } summaries[index] = new RiskScoreSummary(column); + summaries[index].setVariants(metaIndex.get(column)); + summaries[index].setVariantsIgnored(0);//TODO: merge sums them up. metaIndex2.get(column)); variantsIndex.put(index, new HashMap()); index++; } @@ -125,17 +143,9 @@ public void buildIndex(String chromosome, Chunk chunk, String dbsnp, String prox //TODO: rethink: better position -> score index? easier to check.. ReferenceVariant variant = new ReferenceVariant(otherAllele, effectAllele, weight.floatValue()); variantsIndex.get(index).put(position, variant); - - //TODO; just testing; we need all variants from whole file! - summaries[index].incVariants(); index++; } - //TODO: total variants. meta file? - //go to chunk or complete file. - //summaries[i].setVariants(riskscore.getTotalVariants()); - //summaries[i].setVariantsIgnored(riskscore.getIgnoredVariants()); - } reader.close(); @@ -173,6 +183,11 @@ public Set> getAllVariants(int index) { return variantsIndex.get(index).entrySet(); } + @Override + public void clearIndex() { + variantsIndex = null; + } + @Override public int getSize() { return numberRiskScores; diff --git a/src/main/java/genepi/riskscore/io/scores/RiskScoreCollection.java b/src/main/java/genepi/riskscore/io/scores/RiskScoreCollection.java index 952a29f..394b000 100644 --- a/src/main/java/genepi/riskscore/io/scores/RiskScoreCollection.java +++ b/src/main/java/genepi/riskscore/io/scores/RiskScoreCollection.java @@ -121,6 +121,13 @@ public int getSize() { return numberRiskScores; } + @Override + public void clearIndex() { + for (RiskScoreFile riskscore: riskscores) { + riskscore.clearIndex(); + } + } + @Override public boolean isEmpty() { for (RiskScoreFile riskscore : riskscores) { diff --git a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java index d1f0019..3ceee06 100644 --- a/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java +++ b/src/main/java/genepi/riskscore/tasks/ApplyScoreTask.java @@ -189,6 +189,9 @@ public void run(ITaskMonitor monitor) throws Exception { reportFile.save(outputReportFilename); } + //cleanup + collection.clearIndex(); + monitor.done(); } catch (Exception e) { if (VERBOSE) {