Skip to content

Commit

Permalink
Ported CompareGtcFiles from Picard private repo (#1468)
Browse files Browse the repository at this point in the history
Made some classes implement AutoCloseable
  • Loading branch information
gbggrant authored Mar 16, 2020
1 parent 5e73783 commit b3b8fcc
Show file tree
Hide file tree
Showing 4 changed files with 287 additions and 40 deletions.
78 changes: 40 additions & 38 deletions src/main/java/picard/arrays/GtcToVcf.java
Original file line number Diff line number Diff line change
Expand Up @@ -154,16 +154,6 @@ public class GtcToVcf extends CommandLineProgram {

static final List<Allele> NO_CALL_ALLELES = Collections.unmodifiableList(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));

// This file gets initialized during customCommandLineValidation.
// It is a static member so we don't have to parse the file twice.
private static InfiniumGTCFile infiniumGTCFile;

private static InfiniumEGTFile infiniumEGTFile;

private static String gtcGender = null;

private static Sex fingerprintGender;

private static ReferenceSequenceFile refSeq;

private static final DecimalFormat df = new DecimalFormat();
Expand All @@ -182,27 +172,36 @@ protected boolean requiresReference() {

@Override
protected int doWork() {
final Build37ExtendedIlluminaManifest manifest = setupAndGetManifest();
final InfiniumNormalizationManifest infiniumNormalizationManifest = new InfiniumNormalizationManifest(ILLUMINA_NORMALIZATION_MANIFEST);

Sex fingerprintSex = getFingerprintSex(FINGERPRINT_GENOTYPES_VCF_FILE);
String gtcGender = getGenderFromGtcFile(GENDER_GTC, infiniumNormalizationManifest);
try (InfiniumGTCFile infiniumGTCFile = new InfiniumGTCFile(new DataInputStream(new FileInputStream(INPUT)), infiniumNormalizationManifest);
InfiniumEGTFile infiniumEGTFile = new InfiniumEGTFile(CLUSTER_FILE)) {
final Build37ExtendedIlluminaManifest manifest = setupAndGetManifest(infiniumGTCFile);

final VCFHeader vcfHeader = createVCFHeader(manifest, infiniumGTCFile, gtcGender, CLUSTER_FILE,
REFERENCE_SEQUENCE, refSeq.getSequenceDictionary());
final VCFHeader vcfHeader = createVCFHeader(manifest, infiniumGTCFile, gtcGender, fingerprintSex, CLUSTER_FILE,
REFERENCE_SEQUENCE, refSeq.getSequenceDictionary());

// Setup a collection that will sort contexts properly
// Necessary because input GTC file is not sorted
final SortingCollection<VariantContext> contexts =
SortingCollection.newInstance(
VariantContext.class,
new VCFRecordCodec(vcfHeader),
new VariantContextComparator(refSeq.getSequenceDictionary()),
MAX_RECORDS_IN_RAM,
TMP_DIR.stream().map(File::toPath).toArray(Path[]::new));
// Setup a collection that will sort contexts properly
// Necessary because input GTC file is not sorted
final SortingCollection<VariantContext> contexts =
SortingCollection.newInstance(
VariantContext.class,
new VCFRecordCodec(vcfHeader),
new VariantContextComparator(refSeq.getSequenceDictionary()),
MAX_RECORDS_IN_RAM,
TMP_DIR.stream().map(File::toPath).toArray(Path[]::new));

// fill the sorting collection
fillContexts(contexts, infiniumGTCFile, manifest, infiniumEGTFile);
// fill the sorting collection
fillContexts(contexts, infiniumGTCFile, manifest, infiniumEGTFile);

writeVcf(contexts, OUTPUT, refSeq.getSequenceDictionary(), vcfHeader);
writeVcf(contexts, OUTPUT, refSeq.getSequenceDictionary(), vcfHeader);

return 0;
return 0;
} catch (IOException e) {
throw new PicardException("Error processing GTC File: " + INPUT.getAbsolutePath(), e);
}
}

@Override
Expand All @@ -228,21 +227,11 @@ protected String[] customCommandLineValidation() {
return super.customCommandLineValidation();
}

private Build37ExtendedIlluminaManifest setupAndGetManifest() {
fingerprintGender = getFingerprintSex(FINGERPRINT_GENOTYPES_VCF_FILE);
final InfiniumNormalizationManifest infiniumNormalizationManifest = new InfiniumNormalizationManifest(ILLUMINA_NORMALIZATION_MANIFEST);
try (final DataInputStream gtcInputStream = new DataInputStream(new FileInputStream(INPUT))) {
private Build37ExtendedIlluminaManifest setupAndGetManifest(InfiniumGTCFile infiniumGTCFile) {

infiniumEGTFile = new InfiniumEGTFile(CLUSTER_FILE);
infiniumGTCFile = new InfiniumGTCFile(gtcInputStream, infiniumNormalizationManifest);
try {
final Build37ExtendedIlluminaManifest manifest = new Build37ExtendedIlluminaManifest(EXTENDED_ILLUMINA_MANIFEST);

if (GENDER_GTC != null) {
try (DataInputStream genderGtcStream = new DataInputStream(new FileInputStream(GENDER_GTC))) {
gtcGender = new InfiniumGTCFile(genderGtcStream, infiniumNormalizationManifest).getGender();
}
}

final String gtcManifestName = FilenameUtils.removeExtension(infiniumGTCFile.getSnpManifest());
final String illuminaManifestName = FilenameUtils.removeExtension(manifest.getDescriptorFileName());

Expand Down Expand Up @@ -274,6 +263,18 @@ static Sex getFingerprintSex(final File file) {
return Sex.Unknown;
}

private String getGenderFromGtcFile(final File gtcFile, final InfiniumNormalizationManifest infiniumNormalizationManifest) {
String gtcGender = null;
if (gtcFile != null) {
try (InfiniumGTCFile infiniumGTCGenderFile = new InfiniumGTCFile(new DataInputStream(new FileInputStream(gtcFile)), infiniumNormalizationManifest)) {
gtcGender = infiniumGTCGenderFile.getGender();
} catch (IOException e) {
throw new PicardException("Error processing GTC File: " + gtcFile.getAbsolutePath(), e);
}
}
return gtcGender;
}

private void fillContexts(final SortingCollection<VariantContext> contexts, final InfiniumGTCFile gtcFile,
final Build37ExtendedIlluminaManifest manifest, final InfiniumEGTFile egtFile) {
final ProgressLogger progressLogger = new ProgressLogger(log, 100000, "sorted");
Expand Down Expand Up @@ -514,6 +515,7 @@ private void writeVcf(final SortingCollection<VariantContext> variants,
private VCFHeader createVCFHeader(final Build37ExtendedIlluminaManifest manifest,
final InfiniumGTCFile gtcFile,
final String gtcGender,
final Sex fingerprintGender,
final File clusterFile,
final File reference,
final SAMSequenceDictionary dict) {
Expand Down
235 changes: 235 additions & 0 deletions src/main/java/picard/arrays/illumina/CompareGtcFiles.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,235 @@
package picard.arrays.illumina;

import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.Log;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import picard.PicardException;
import picard.cmdline.CommandLineProgram;
import picard.cmdline.StandardOptionDefinitions;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.lang.reflect.Array;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
* A simple tool to compare two Illumina GTC files.
*/

@CommandLineProgramProperties(
summary = CompareGtcFiles.USAGE_DETAILS,
oneLineSummary = "Compares two GTC files.",
programGroup = picard.cmdline.programgroups.GenotypingArraysProgramGroup.class
)
public class CompareGtcFiles extends CommandLineProgram {

static final String USAGE_DETAILS =
"CompareGtcFiles takes two Illumina GTC file and compares their contents to ensure that fields expected to be the same " +
"are in fact the same. This will exclude any variable field, such as a date. " +
"The GTC files must be generated on the same chip type. " +
"<h4>Usage example:</h4>" +
"<pre>" +
"java -jar picard.jar CompareGtcFiles \\<br />" +
" INPUT=input1.gtc \\<br />" +
" INPUT=input2.gtc \\<br />" +
" ILLUMINA_NORMALIZATION_MANIFEST=chip_name.bpm.csv \\<br />" +
"</pre>";

private static final Log log = Log.getInstance(CompareGtcFiles.class);

@Argument(shortName = StandardOptionDefinitions.INPUT_SHORT_NAME,
doc = "GTC input files to compare.",
minElements = 2,
maxElements = 2)
public List<File> INPUT;

@Argument(shortName = "NORM_MANIFEST", doc = "An Illumina bead pool manifest (a manifest containing the Illumina normalization ids) (bpm.csv)")
public File ILLUMINA_NORMALIZATION_MANIFEST;

private final List<String> errors = new ArrayList<>();

// Ignored methods
private static final List<String> IGNORED_METHODS = Arrays.asList(
"getClass",
"getAutocallDate",
"getImagingDate",
// This is the number of TOC entries. It will be different with different versions.
"getNumberOfEntries",
// We don't inject these in our gtcs so they will always be blank and so we don't bother comparing.
"getSampleName",
"getSamplePlate",
"getSampleWell");

@Override
protected int doWork() {
IOUtil.assertFilesAreReadable(INPUT);

InfiniumNormalizationManifest infiniumNormalizationManifest = new InfiniumNormalizationManifest(ILLUMINA_NORMALIZATION_MANIFEST);
try (InfiniumGTCFile gtcFileOne = new InfiniumGTCFile(new DataInputStream(new FileInputStream(INPUT.get(0))), infiniumNormalizationManifest);
InfiniumGTCFile gtcFileTwo = new InfiniumGTCFile(new DataInputStream(new FileInputStream(INPUT.get(1))), infiniumNormalizationManifest)) {
compareGTCFiles(gtcFileOne, gtcFileTwo);

// Report errors and exit 1 if any are detected.
if (!errors.isEmpty()) {
for (String error : errors) {
log.error(error);
}
return 1;
}
} catch (Exception e) {
throw new PicardException("File error: ", e);
}
return 0;
}

private void compareGTCFiles(InfiniumGTCFile gtcFileOne, InfiniumGTCFile gtcFileTwo) throws InvocationTargetException, IllegalAccessException {
// Compare all fields we expect won't change.
Method[] methods = gtcFileOne.getClass().getMethods();
for (Method method : methods) {
// Skip ignored methods.
if (IGNORED_METHODS.contains(method.getName())) {
continue;
}
// Compare all getters
if (method.getName().startsWith("get") && method.getGenericParameterTypes().length == 0) {
// If we have a version and they don't match we just want a warning
// If getter returns an array compare all array values otherwise do an Object compare.
// If getter returns an array of arrays do deep compare
if (method.getName().equals("getFileVersion")) {
compareVersions(method.invoke(gtcFileOne), method.invoke(gtcFileTwo));
} else if (method.getReturnType().isArray() && method.getReturnType().getComponentType().isArray()) {
compareArrayOfArrays(method.invoke(gtcFileOne), method.invoke(gtcFileTwo),
method.getName());
} else if (method.getReturnType().isArray()) {
compareArrays(method.invoke(gtcFileOne), method.invoke(gtcFileTwo),
method.getName());
} else {
compare(method.invoke(gtcFileOne),
method.invoke(gtcFileTwo), method.getName());
}
}
}
}

private void compareVersions(Object versionOne, Object versionTwo) {
if (!versionOne.equals(versionTwo)) {
log.warn(String.format("File versions do not match ( %s vs %s )",
versionOne, versionTwo));
}
}

private void compare(Object objectOne, Object objectTwo, String methodName) {
if (checkNulls(objectOne, objectTwo, methodName)) return;

List<String> compareErrors = new ArrayList<>();

if (!objectOne.equals(objectTwo)) {
compareErrors.add(String.format("%s does not match ( %s vs %s )",
methodName, objectOne, objectTwo));
}
checkErrors(methodName, compareErrors);
}

private void compareArrays(Object arrayOne, Object arrayTwo, String methodName) {
if (checkNulls(arrayOne, arrayTwo, methodName)) return;

List<String> compareErrors = new ArrayList<>();
int differences = arrayDifferences(arrayOne, arrayTwo, methodName, compareErrors);
if (differences > 0) {
compareErrors.add(String.format("%s do not match. %d elements of the array differ.", methodName, differences));
}
checkErrors(methodName, compareErrors);
}

private int arrayDifferences(Object arrayOne, Object arrayTwo, String methodName, List<String> compareErrors) {
int length1 = Array.getLength(arrayOne);
int length2 = Array.getLength(arrayTwo);

int diffCount = 0;
if (length1 != length2) {
compareErrors.add(String.format("%s do not match. Arrays of different lengths. ( %d vs %d )",
methodName, length1, length2));
} else {
for (int i = 0; i < length1; i++) {
// For floats only compare 3 decimal places
if (arrayOne.getClass().getComponentType() == float.class) {
Float float1 = (float) Array.get(arrayOne, i);
Float float2 = (float) Array.get(arrayTwo, i);
if (float1.equals(Float.NaN) || float2.equals(Float.NaN)) {
if (!float1.equals(float2)) diffCount++;
} else {
BigDecimal decimal1 = BigDecimal.valueOf(float1).setScale(3, BigDecimal.ROUND_DOWN);
BigDecimal decimal2 = BigDecimal.valueOf(float2).setScale(3, BigDecimal.ROUND_DOWN);
if (!decimal1.equals(decimal2)) {
diffCount++;
}
}
} else if (!Array.get(arrayOne, i).equals(Array.get(arrayTwo, i))) {
diffCount++;
}
}
}
return diffCount;
}


private void compareArrayOfArrays(Object arrayOfArraysOne, Object arrayOfArraysTwo, String methodName) {
List<String> compareErrors = new ArrayList<>();

if (checkNulls(arrayOfArraysOne, arrayOfArraysTwo, methodName)) return;

int differences = 0;
int length1 = Array.getLength(arrayOfArraysOne);
int length2 = Array.getLength(arrayOfArraysTwo);
if (length1 != length2) {
compareErrors.add(String.format("%s do not match. Arrays of different lengths. ( %d vs %d )",
methodName, length1, length2));
} else {
// Iterate over the first array
for (int i = 0; i < length1; i++) {
// Iterate over the second array
Object innerArrayOne = Array.get(arrayOfArraysOne, i);
Object innerArrayTwo = Array.get(arrayOfArraysTwo, i);
differences += arrayDifferences(innerArrayOne, innerArrayTwo, methodName, compareErrors);
}
if (differences > 0) {
compareErrors.add(String.format("%s do not match. %d elements of the array differ.", methodName, differences));
}
}
checkErrors(methodName, compareErrors);
}

private void checkErrors(String methodName, List<String> compareErrors) {
if (compareErrors.size() > 0) {
errors.addAll(compareErrors);
} else {
log.info(methodName + " IDENTICAL");
}
}

private boolean checkNulls(Object objectOne, Object objectTwo, String methodName) {
if (objectOne == null && objectTwo == null) {
log.warn(String.format("Field %s is missing from both files. Strange.", methodName));
return true;
}
// If one is null we assume a version mismatch
if (objectOne == null || objectTwo == null) {
log.warn(String.format("Field %s is missing from one of the files. Version mismatch likely", methodName));
return true;
}
// If they are primitives check for 0 instead of null
if ((objectOne.equals(0) ^ objectTwo.equals(0)) || (objectOne.equals(0.0f) ^ objectTwo.equals(0.0f))) {
log.warn(String.format("Field %s is not in both files. Version mismatch likely", methodName));
return true;
}
return false;
}

}
7 changes: 6 additions & 1 deletion src/main/java/picard/arrays/illumina/InfiniumEGTFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
* A cluster file contains information about the clustering information used in mapping red / green intensity information
* to genotype calls
*/
public class InfiniumEGTFile extends InfiniumDataFile {
public class InfiniumEGTFile extends InfiniumDataFile implements AutoCloseable {
public static final String EXTENSION = "egt";

private static final int VALID_GENTRAIN_DATA_TYPE = 9;
Expand Down Expand Up @@ -67,6 +67,11 @@ public InfiniumEGTFile(final File clusterFile) throws IOException {
parse();
}

@Override
public void close() throws IOException {
stream.close();
}

private void parse() throws IOException {
try {
readHeaderData();
Expand Down
7 changes: 6 additions & 1 deletion src/main/java/picard/arrays/illumina/InfiniumGTCFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
* This class will parse the binary GTC file format and allow access to the genotype, scores, basecalls and raw
* intensities.
*/
public class InfiniumGTCFile extends InfiniumDataFile {
public class InfiniumGTCFile extends InfiniumDataFile implements AutoCloseable {

private static final int NUM_SNPS = 1;
private static final int PLOIDY = 2;
Expand Down Expand Up @@ -158,6 +158,11 @@ public InfiniumGTCFile(final DataInputStream gtcStream, final InfiniumNormalizat
normalizeAndCalculateStatistics();
}

@Override
public void close() throws IOException {
stream.close();
}

InfiniumGTCFile(final DataInputStream gtcStream) throws IOException {
super(gtcStream, true);
this.normalizationManifest = null;
Expand Down

0 comments on commit b3b8fcc

Please sign in to comment.