Skip to content

Commit

Permalink
Add bgzip support to collections and info files
Browse files Browse the repository at this point in the history
  • Loading branch information
lukfor committed Dec 3, 2023
1 parent c6abe8a commit ad98b9b
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 39 deletions.
165 changes: 138 additions & 27 deletions src/main/java/genepi/riskscore/commands/CreateCollectionCommand.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package genepi.riskscore.commands;

import java.io.File;
import java.io.OutputStreamWriter;
import java.util.*;
import java.util.concurrent.Callable;

Expand All @@ -13,6 +15,7 @@
import genepi.riskscore.io.formats.PGSCatalogHarmonizedFormat;
import genepi.riskscore.io.formats.RiskScoreFormatImpl;
import genepi.riskscore.io.scores.MergedRiskScoreCollection;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import picocli.CommandLine.Command;
import picocli.CommandLine.Option;
import picocli.CommandLine.Parameters;
Expand All @@ -24,7 +27,7 @@ public class CreateCollectionCommand implements Callable<Integer> {
private String output = null;

@Parameters(description = "score files")
private String[] filenames;
private String[] files;

public static String[] chromosomeOrder = {"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "X", "Y", "XY"};

Expand All @@ -39,30 +42,43 @@ public class CreateCollectionCommand implements Callable<Integer> {
@Override
public Integer call() throws Exception {

PGSCatalogHarmonizedFormat format = new PGSCatalogHarmonizedFormat();

List<String> validFilenames = new Vector<String>(files.length);
//validate files
for (String filename: files){
if (checkFileFormat(filename, format)){
validFilenames.add(filename);
}
}

String[] filenames = new String[validFilenames.size()];
filenames = validFilenames.toArray(filenames);
String[] names = new String[filenames.length];
int[] totalVariants = new int[filenames.length];
int[] ignoredVariants = new int[filenames.length];
CsvWithHeaderTableReader[] readers = new CsvWithHeaderTableReader[filenames.length];
RiskScoreFormatImpl[] formats = new RiskScoreFormatImpl[filenames.length];
Variant[] variants = new Variant[filenames.length];

for (int i = 0; i < filenames.length; i++) {
names[i] = RiskScoreFile.getName(filenames[i]);
formats[i] = new PGSCatalogHarmonizedFormat();
readers[i] = new CsvWithHeaderTableReader(filenames[i], formats[i].getSeparator());
readers[i] = new CsvWithHeaderTableReader(filenames[i], format.getSeparator());
try {
variants[i] = readVariant(readers[i], formats[i]);
variants[i] = readVariant(readers[i], format);
} catch (Exception e) {
throw new RuntimeException("File " + filenames[i], e);
}
}

List<String> header = new Vector<String>();
header.add(MergedRiskScoreCollection.HEADER);
header.add("#Date=" + new Date());
header.add("#Scores=" + filenames.length);
header.add("# Date=" + new Date());
header.add("# Scores=" + filenames.length);


CsvWithHeaderTableWriter writer = null;
if (output != null) {
writer = new CsvWithHeaderTableWriter(output, '\t', header);
writer = new CsvWithHeaderTableWriter(new OutputStreamWriter(new BlockCompressedOutputStream(new File(output))), '\t', header);
} else {
writer = new CsvWithHeaderTableWriter('\t', header);
}
Expand All @@ -80,16 +96,24 @@ public Integer call() throws Exception {
addVariant(writer, variant);
for (int i = 0; i < variants.length; i++) {
if (variants[i] != null && variants[i].matches(variant)) {
writeVariant(writer, names[i], variants[i].getNormalizedEffect(variant));
variants[i].align(variant);
writeVariant(writer, names[i], variants[i]);
Variant nextVariant = null;
try {
nextVariant = readVariant(readers[i], formats[i]);
} catch (Exception e) {
throw new RuntimeException("File " + filenames[i], e);
boolean read = true;
while(read) {
try {
nextVariant = readVariant(readers[i], format);
read = false;
}catch (VariantReadingException e){
ignoredVariants[i]++;
} catch (Exception e) {
throw new RuntimeException("File " + filenames[i], e);
}
}
if (nextVariant != null && nextVariant.isBefore(variants[i])) {
throw new RuntimeException(filenames[i] + ": Not sorted. " + nextVariant + " is before " + variants[i]);
}
totalVariants[i]++;
variants[i] = nextVariant;
} else {
writeMissing(writer, names[i]);
Expand All @@ -106,6 +130,18 @@ public Integer call() throws Exception {
reader.close();
}

if (output != null) {
CsvWithHeaderTableWriter writerMeta = new CsvWithHeaderTableWriter(output + MergedRiskScoreCollection.META_EXTENSION, '\t', header);
writerMeta.setColumns(new String[]{"score","variants","ignored"});
for (int i = 0; i < names.length; i++){
writerMeta.setString("score",names[i]);
writerMeta.setInteger("variants",totalVariants[i]);
writerMeta.setInteger("ignored",ignoredVariants[i]);
writerMeta.next();
}
writerMeta.close();
}

System.err.println("Wrote " + variantsWritten + " unique variants and " + filenames.length + " scores.");

return 0;
Expand All @@ -117,7 +153,7 @@ public void setOutput(String output) {
}

public void setFilenames(String[] filenames) {
this.filenames = filenames;
this.files = filenames;
}


Expand Down Expand Up @@ -156,19 +192,57 @@ private String[] merge(String[] first, String[] second) {
return result;
}

public Variant readVariant(ITableReader reader, RiskScoreFormatImpl format) {
public Variant readVariant(ITableReader reader, RiskScoreFormatImpl format) throws VariantReadingException {
int row = 0;

if (!reader.next()) {
return null;
}
Variant variant = new Variant();
variant.setChromosome(reader.getString(format.getChromosome()));

String chromosome = reader.getString(format.getChromosome());
if (!chromosomeOrderIndex.containsKey(chromosome)){
throw new VariantReadingException("Row " + row + ": Chromosome is invalid.");
}

if (reader.getString(format.getPosition()).isEmpty()) {
throw new RuntimeException("Not position found.");
throw new VariantReadingException("Row " + row + ": Position is empty. Ignore variant.");
}

int position = 0;
try {
position = reader.getInteger(format.getPosition());

} catch (NumberFormatException e) {
throw new VariantReadingException("Row " + row + ": '" + reader.getString(format.getPosition())
+ "' is an invalid position. Ignore variant.");
}

float effectWeight = 0;
try {
effectWeight = ((Double) (reader.getDouble(format.getEffectWeight()))).floatValue();
} catch (NumberFormatException e) {
throw new VariantReadingException("Row " + row + ": '" + reader.getString(format.getEffectWeight())
+ "' is an invalid weight. Ignore variant.");
}
variant.setPosition(reader.getInteger(format.getPosition()));
variant.setEffectAllele(reader.getString(format.getEffectAllele()));
variant.setOtherAllele(reader.getString(format.getOtherAllele()));
variant.setEffect(reader.getDouble(format.getEffectWeight()));

String rawOtherA = reader.getString(format.getOtherAllele());
if (rawOtherA.isEmpty()) {
throw new VariantReadingException("Row " + row + ": Other allele is empty. Ignore variant.");
}
String otherAllele = rawOtherA.trim();

String rawEffectAllele = reader.getString(format.getEffectAllele());
if (rawEffectAllele.isEmpty()) {
throw new VariantReadingException("Row " + row + ": Effect allele is empty. Ignore variant.");
}
String effectAllele = rawEffectAllele.trim();

Variant variant = new Variant();
variant.setChromosome(chromosome);
variant.setPosition(position);
variant.setEffectAllele(effectAllele);
variant.setOtherAllele(otherAllele);
variant.setEffect(effectWeight);
return variant;
}

Expand All @@ -180,8 +254,8 @@ public void addVariant(ITableWriter writer, Variant variant) {
writer.setString(MergedRiskScoreCollection.COLUMN_OTHER_ALLELE, variant.getOtherAllele());
}

public void writeVariant(ITableWriter writer, String score, double effect) {
writer.setDouble(score, effect);
public void writeVariant(ITableWriter writer, String score, Variant variant) {
writer.setDouble(score, variant.getEffect());
}

public void writeMissing(ITableWriter writer, String score) {
Expand Down Expand Up @@ -235,18 +309,26 @@ public void setOtherAllele(String otherAllele) {
this.otherAllele = otherAllele;
}

public double getNormalizedEffect(Variant variant) {
public void align(Variant variant) {
if (this.hasSameAlleles(variant)) {
return effect;
return;
}

if (this.hasSwappedAlleles(variant)) {
return -effect;
swapAlleles();
return;
}

throw new RuntimeException("Error. Wrong alleles!!");
}

public void swapAlleles(){
String _otherAllele = otherAllele;
otherAllele = effectAllele;
effectAllele = _otherAllele;
effect = -effect;
}

private boolean hasSameAlleles(Variant variant) {
return this.effectAllele.equals(variant.effectAllele) && this.otherAllele.equals(variant.otherAllele);
}
Expand Down Expand Up @@ -295,5 +377,34 @@ public boolean matches(Variant variant) {
}
}


private boolean checkFileFormat(String filename, RiskScoreFormatImpl format) throws Exception {
CsvWithHeaderTableReader reader = new CsvWithHeaderTableReader(filename, format.getSeparator());
reader.close();
if (!reader.hasColumn(format.getChromosome())) {
System.out.println("Column '" + format.getChromosome() + "' not found in '" + filename + "'. Ignore.");
return false;
}
if (!reader.hasColumn(format.getPosition())) {
System.out.println("Column '" + format.getPosition() + "' not found in '" + filename + "'. Ignore.");
return false;
}
if (!reader.hasColumn(format.getEffectWeight())) {
System.out.println("Column '" + format.getEffectWeight() + "' not found in '" + filename + "'. Ignore.");
return false;
}
if (!reader.hasColumn(format.getOtherAllele())) {
System.out.println("Column '" + format.getOtherAllele() + "' not found in '" + filename + "'. Ignore.");
return false;
}
if (!reader.hasColumn(format.getEffectAllele())) {
System.out.println("Column '" + format.getEffectAllele() + "' not found in '" + filename + "'. Ignore.");
return false;
}

return true;

}

}

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package genepi.riskscore.commands;

public class VariantReadingException extends Exception {
public VariantReadingException(String message) {
super(message);
}
}
3 changes: 3 additions & 0 deletions src/main/java/genepi/riskscore/io/RiskScoreFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -277,4 +277,7 @@ public void warning(String text) {
}
}

public void clearIndex() {
variants = null;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -46,6 +47,17 @@ public CsvWithHeaderTableWriter( char separator, List<String> header) {
CSVWriter.NO_ESCAPE_CHARACTER);
}

public CsvWithHeaderTableWriter(Writer stream, char separator, List<String> header) throws IOException {
for (String line : header) {
stream.write(line.replace("\n", "").replace("\r", ""));
stream.write(System.lineSeparator());
}
stream.write("# Created by " + App.APP + " " + App.VERSION);
stream.write(System.lineSeparator());
writer = new CSVWriter(stream, separator, CSVWriter.NO_QUOTE_CHARACTER,
CSVWriter.NO_ESCAPE_CHARACTER);
}

@Override
public void close() {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,5 @@ public interface IRiskScoreCollection {

public RiskScoreSummary[] getSummaries();

public void clearIndex();
}
Loading

0 comments on commit ad98b9b

Please sign in to comment.