Skip to content

Commit

Permalink
Modify VcfToAdpc to handle multiple samples. (#1470)
Browse files Browse the repository at this point in the history
Put NaN in the adc.bin file if normalized intensity is null.
Output files containing number of samples and markers (useful for running VerifyIDIntensity itself).
Added tests
  • Loading branch information
gbggrant authored Feb 26, 2020
1 parent 4782078 commit 81bef8d
Show file tree
Hide file tree
Showing 19 changed files with 555 additions and 56 deletions.
108 changes: 76 additions & 32 deletions src/main/java/picard/arrays/VcfToAdpc.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFFileReader;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang.StringUtils;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
Expand All @@ -42,7 +43,12 @@
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

Expand All @@ -62,55 +68,86 @@ public class VcfToAdpc extends CommandLineProgram {
"An adpc.bin file is a binary file containing genotyping array intensity data that can be exported " +
"by Illumina's GenomeStudio and Beadstudio analysis tools. The adpc.bin file is used as an input to " +
"<a href='https://genome.sph.umich.edu/wiki/VerifyIDintensity'>VerifyIDintensity</a> a tool for " +
"detecting and estimating sample contamination of Illumina genotyping array data." +
"detecting and estimating sample contamination of Illumina genotyping array data. " +
"If more than one VCF is used, they must all have the same number of loci." +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar VcfToAdpc \\<br />" +
" VCF=input.vcf \\<br />" +
" OUTPUT=output.adpc.bin" +
" OUTPUT=output.adpc.bin \\<br />" +
" SAMPLES_FILE=output.samples.txt \\<br />" +
" NUM_MARKERS_FILE=output.num_markers.txt \\<br />" +
"</pre>";


private final Log log = Log.getInstance(VcfToAdpc.class);

@Argument(doc = "The Input VCF")
public File VCF;
@Argument(doc = "One or more VCF files containing array intensity data.")
public List<File> VCF;

@Argument(shortName = StandardOptionDefinitions.OUTPUT_SHORT_NAME, doc = "The output (adpc.bin) file to write.")
public File OUTPUT;

@Argument(shortName = "SF", doc = "A text file into which the names of the samples will be written. " +
"These will be in the same order as the data in the adpc.bin file.")
public File SAMPLES_FILE;

@Argument(shortName = "NMF", doc = "A text file into which the number of loci in the VCF will be written. " +
"This is useful for calling verifyIDIntensity.")
public File NUM_MARKERS_FILE;

@Override
protected int doWork() {
IOUtil.assertFileIsReadable(VCF);
final List<File> inputs = IOUtil.unrollFiles(VCF, IOUtil.VCF_EXTENSIONS);
IOUtil.assertFilesAreReadable(inputs);
IOUtil.assertFileIsWritable(SAMPLES_FILE);
IOUtil.assertFileIsWritable(NUM_MARKERS_FILE);
IOUtil.assertFileIsWritable(OUTPUT);
VCFFileReader vcfFileReader = new VCFFileReader(VCF, false);
final List<String> sampleNames = new ArrayList<>();

Integer numberOfLoci = null;
try (IlluminaAdpcFileWriter adpcFileWriter = new IlluminaAdpcFileWriter(OUTPUT)) {
for (final File inputVcf : inputs) {
VCFFileReader vcfFileReader = new VCFFileReader(inputVcf, false);
final VCFHeader header = vcfFileReader.getFileHeader();
for (int sampleNumber = 0; sampleNumber < header.getNGenotypeSamples(); sampleNumber++) {
final String sampleName = header.getGenotypeSamples().get(sampleNumber);
sampleNames.add(sampleName);
log.info("Processing sample: " + sampleName + " from VCF: " + inputVcf.getAbsolutePath());

try (CloseableIterator<VariantContext> variants = vcfFileReader.iterator();
IlluminaAdpcFileWriter adpcFileWriter = new IlluminaAdpcFileWriter(OUTPUT)) {
CloseableIterator<VariantContext> variants = vcfFileReader.iterator();
int lociCount = 0;
while (variants.hasNext()) {
final VariantContext context = variants.next();
final float gcScore = getFloatAttribute(context, InfiniumVcfFields.GC_SCORE);

final List<IlluminaAdpcFileWriter.Record> adpcRecordList = new ArrayList<>();
while (variants.hasNext()) {
final VariantContext context = variants.next();
final float gcScore = getFloatAttribute(context, InfiniumVcfFields.GC_SCORE);
final Genotype genotype = context.getGenotype(sampleNumber);
final IlluminaGenotype illuminaGenotype = getIlluminaGenotype(genotype, context);

for (final Genotype genotype : context.getGenotypes()) {
final IlluminaGenotype illuminaGenotype = getIlluminaGenotype(genotype, context);
final int rawXIntensity = getUnsignedShortAttributeAsInt(genotype, InfiniumVcfFields.X);
final int rawYIntensity = getUnsignedShortAttributeAsInt(genotype, InfiniumVcfFields.Y);

final short rawXIntensity = getShortAttribute(genotype, InfiniumVcfFields.X);
final short rawYIntensity = getShortAttribute(genotype, InfiniumVcfFields.Y);
final Float normalizedXIntensity = getFloatAttribute(genotype, InfiniumVcfFields.NORMX);
final Float normalizedYIntensity = getFloatAttribute(genotype, InfiniumVcfFields.NORMY);

final Float normalizedXIntensity = getFloatAttribute(genotype, InfiniumVcfFields.NORMX);
final Float normalizedYIntensity = getFloatAttribute(genotype, InfiniumVcfFields.NORMY);
if ((normalizedXIntensity != null) && (normalizedYIntensity != null)) {
final IlluminaAdpcFileWriter.Record record = new IlluminaAdpcFileWriter.Record(rawXIntensity, rawYIntensity, normalizedXIntensity, normalizedYIntensity, gcScore, illuminaGenotype);
adpcRecordList.add(record);
adpcFileWriter.write(record);
lociCount++;
}
if (lociCount == 0) {
throw new PicardException("Found no records in VCF' " + inputVcf.getAbsolutePath() + "'");
}
if (numberOfLoci == null) {
numberOfLoci = lociCount;
} else {
if (lociCount != numberOfLoci) {
throw new PicardException("VCFs have differing number of loci");
}
}
}
}
if (adpcRecordList.isEmpty()) {
throw new PicardException("No valid records found in VCF!");
}
adpcFileWriter.write(adpcRecordList);
writeTextToFile(SAMPLES_FILE, StringUtils.join(sampleNames, "\n"));
writeTextToFile(NUM_MARKERS_FILE, "" + numberOfLoci);
} catch (Exception e) {
log.error(e);
return 1;
Expand All @@ -119,6 +156,13 @@ protected int doWork() {
return 0;
}

private void writeTextToFile(final File output, final String text) throws IOException {
try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
new FileOutputStream(output), StandardCharsets.UTF_8))) {
writer.write(text);
}
}

private IlluminaGenotype getIlluminaGenotype(final Genotype genotype, final VariantContext context) {
final IlluminaGenotype illuminaGenotype;
if (genotype.isCalled()) {
Expand Down Expand Up @@ -157,16 +201,16 @@ private IlluminaGenotype getIlluminaGenotype(final Genotype genotype, final Vari
return illuminaGenotype;
}

private short getShortAttribute(final Genotype genotype, final String key) {
private int getUnsignedShortAttributeAsInt(final Genotype genotype, final String key) {
final int attributeAsInt = Integer.parseInt(getRequiredAttribute(genotype, key).toString());
final short returnedAttribute;
if (attributeAsInt <= Short.MAX_VALUE) {
returnedAttribute = (short) attributeAsInt;
} else {
log.warn("Value for key " + key + " (" + attributeAsInt + ") is > " + Short.MAX_VALUE + " (truncating it)");
returnedAttribute = Short.MAX_VALUE;
if (attributeAsInt < 0) {
throw new PicardException("Value for key " + key + " (" + attributeAsInt + ") is <= 0! Invalid value for unsigned int");
}
if (attributeAsInt > picard.arrays.illumina.InfiniumDataFile.MAX_UNSIGNED_SHORT) {
log.warn("Value for key " + key + " (" + attributeAsInt + ") is > " + picard.arrays.illumina.InfiniumDataFile.MAX_UNSIGNED_SHORT + " (truncating it)");
return picard.arrays.illumina.InfiniumDataFile.MAX_UNSIGNED_SHORT;
}
return returnedAttribute;
return attributeAsInt;
}

private Float getFloatAttribute(final Genotype genotype, final String key) {
Expand Down
26 changes: 16 additions & 10 deletions src/main/java/picard/arrays/illumina/IlluminaAdpcFileWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@
*/

public class IlluminaAdpcFileWriter implements AutoCloseable {
private final String HEADER = "1234567890123456";

private final DataOutputStream outputStream;

public IlluminaAdpcFileWriter(final File adpcFile) throws IOException {
Expand All @@ -67,44 +69,48 @@ public IlluminaAdpcFileWriter(final File adpcFile) throws IOException {
}

private void writeHeaderData() throws IOException {
outputStream.write("1234567890123456".getBytes());
outputStream.write(HEADER.getBytes());
}

public void write(Iterable<Record> illuminaAdpcRecords) throws IOException {
for (Record illuminaAdpcRecord : illuminaAdpcRecords) {
illuminaAdpcRecord.write(outputStream);
write(illuminaAdpcRecord);
}
}

public void write(Record illuminaAdpcRecord) throws IOException {
illuminaAdpcRecord.write(outputStream);
}

@Override
public void close() throws Exception {
outputStream.close();
}

public static class Record {
final short aIntensity;
final short bIntensity;
final int aIntensity;
final int bIntensity;
final float aNormalizedIntensity;
final float bNormalizedIntensity;
final float gcScore;
final IlluminaGenotype genotype;

public Record(short aIntensity, short bIntensity, float aNormalizedIntensity, float bNormalizedIntensity, float gcScore, IlluminaGenotype genotype) {
public Record(int aIntensity, int bIntensity, Float aNormalizedIntensity, Float bNormalizedIntensity, float gcScore, IlluminaGenotype genotype) {
this.aIntensity = aIntensity;
this.bIntensity = bIntensity;
this.aNormalizedIntensity = aNormalizedIntensity;
this.bNormalizedIntensity = bNormalizedIntensity;
this.aNormalizedIntensity = aNormalizedIntensity != null ? aNormalizedIntensity : Float.NaN;
this.bNormalizedIntensity = bNormalizedIntensity != null ? bNormalizedIntensity : Float.NaN;
this.gcScore = gcScore;
this.genotype = genotype;
}

public void write(final DataOutputStream outputStream) throws IOException {
InfiniumDataFile.writeShort(outputStream, aIntensity);
InfiniumDataFile.writeShort(outputStream, bIntensity);
InfiniumDataFile.writeUnsignedShort(outputStream, aIntensity);
InfiniumDataFile.writeUnsignedShort(outputStream, bIntensity);
InfiniumDataFile.writeFloat(outputStream, aNormalizedIntensity);
InfiniumDataFile.writeFloat(outputStream, bNormalizedIntensity);
InfiniumDataFile.writeFloat(outputStream, gcScore);
InfiniumDataFile.writeShort(outputStream, genotype.value);
InfiniumDataFile.writeUnsignedShort(outputStream, genotype.value);
}
}
}
13 changes: 10 additions & 3 deletions src/main/java/picard/arrays/illumina/InfiniumDataFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
package picard.arrays.illumina;

import org.apache.commons.io.IOUtils;
import picard.PicardException;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
Expand All @@ -38,6 +39,8 @@
*/
public abstract class InfiniumDataFile {

public static final int MAX_UNSIGNED_SHORT = 65535;

private String identifier;
private int numberOfEntries;
private int fileVersion;
Expand Down Expand Up @@ -178,11 +181,15 @@ float parseFloat() throws IOException {
}

/**
* Utility method for writing a short value to an outputStream.
* Utility method for writing an unsigned short value to an outputStream.
* Note that Java has no unsigned short value, so we pass it as an int and size-validate here.
* Writes in Illumina (little-endian) format
*/
static void writeShort(final DataOutputStream outputStream, final short value) throws IOException {
final byte[] byteArray = shortToByteArray(value);
static void writeUnsignedShort(final DataOutputStream outputStream, final int value) throws IOException {
if (value < 0 || value > MAX_UNSIGNED_SHORT) {
throw new PicardException("Value " + value + " is out of range for a unsigned short");
}
final byte[] byteArray = shortToByteArray((short) (value & 0x0000ffff));
outputStream.write(byteArray);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
public class IlluminaAdpcFileWriterTest {

private static final File TEST_DATA_DIR = new File("testdata/picard/arrays/illumina");
private static final File TEST_EXPECTED_ADPC_BIN_FILE = new File(TEST_DATA_DIR, "TestIlluminaAdpcFileWriter.adpc.bin");
private static final File TEST_EXPECTED_ADPC_BIN_FILE = new File(TEST_DATA_DIR, "TestVcfToAdpc.adpc.bin");

@Test
public void testWriteIlluminaAdpcFile() throws Exception {
Expand All @@ -20,11 +20,12 @@ public void testWriteIlluminaAdpcFile() throws Exception {

try (final IlluminaAdpcFileWriter adpcFileWriter = new IlluminaAdpcFileWriter(output)) {
final List<IlluminaAdpcFileWriter.Record> adpcRecordList = new ArrayList<>();
adpcRecordList.add(new IlluminaAdpcFileWriter.Record((short) 11352, (short) 405, 1.444f, 0.088f, 0.705f, IlluminaGenotype.AA));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record((short) 458, (short) 2743, 0.043f, 0.852f, 0.818f, IlluminaGenotype.BB));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record((short) 7548, (short) 303, 1.072f, 0.076f, 0.0f, IlluminaGenotype.NN));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record((short) 7414, (short) 2158, 0.805f, 0.597f, 0.881f, IlluminaGenotype.AB));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record((short) 222, (short) 215, 0.0f, 0.0f, 0.91f, IlluminaGenotype.NN));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record(11352, 405, 1.444f, 0.088f, 0.705f, IlluminaGenotype.AA));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record(458, 2743, 0.043f, 0.852f, 0.818f, IlluminaGenotype.BB));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record(7548, 303, 1.072f, 0.076f, 0.0f, IlluminaGenotype.NN));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record(7414, 2158, 0.805f, 0.597f, 0.881f, IlluminaGenotype.AB));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record(222, 215, 0.0f, 0.0f, 0.91f, IlluminaGenotype.NN));
adpcRecordList.add(new IlluminaAdpcFileWriter.Record(232, 246, null, null, 0.926f, IlluminaGenotype.NN));
adpcFileWriter.write(adpcRecordList);
}
IOUtil.assertFilesEqual(TEST_EXPECTED_ADPC_BIN_FILE, output);
Expand Down
Loading

0 comments on commit 81bef8d

Please sign in to comment.