Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Numerical consistency between python and java, transpose fix #4652

Merged
merged 1 commit into from
Apr 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,15 @@ public final class ReadGroupBlackListReadFilter extends ReadFilter implements Se
// Command line parser requires a no-arg constructor
public ReadGroupBlackListReadFilter() {};

/**
* Creates a filter using the lists of files with blacklisted read groups.
* Any entry can be a path to a file (ending with "list" or "txt"), which
* will load the blacklist from that file. This scheme works recursively
* (i.e., the file may contain names of further files, etc.).
*/
/**
* Creates a filter using the lists of files with blacklisted read groups.
* Any entry can be a path to a file (ending with "list" or "txt"), which
* will load the blacklist from that file. This scheme works recursively
* (i.e., the file may contain names of further files, etc.).
*/
public ReadGroupBlackListReadFilter(final List<String> blackLists, final SAMFileHeader header) {
super.setHeader(header);
this.blackList.addAll(blackLists);
final Map<String, Collection<String>> filters = new TreeMap<>();
for (String blackList : blackLists) {
try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.engine.filters.VariantFilter;
import org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary;
import org.broadinstitute.hellbender.engine.filters.*;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter;
import org.broadinstitute.hellbender.utils.io.Resource;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.python.StreamingPythonScriptExecutor;
Expand Down Expand Up @@ -170,7 +170,7 @@ public class CNNScoreVariants extends VariantWalker {

private int curBatchSize = 0;
private int windowEnd = windowSize / 2;
private int windowStart = (windowSize / 2) - 1;
private int windowStart = windowSize / 2;
private boolean waitforBatchCompletion = false;
private File scoreFile;

Expand All @@ -190,7 +190,6 @@ protected String[] customCommandLineValidation() {
return new String[]{"No default architecture for tensor type:" + tensorType.name()};
}
}

return null;
}

Expand All @@ -208,6 +207,17 @@ protected VariantFilter makeVariantFilter(){
}
}

@Override
public List<ReadFilter> getDefaultReadFilters() {
List<ReadFilter> readFilters = new ArrayList<>();
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This list should be initialized with the values returned from super.getDefaultReadFilters to make sure you get the engine defaults like WellFormedReadFilter.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

readFilters.addAll(super.getDefaultReadFilters());
List<String> filterList = new ArrayList<>();
filterList.add("ID:" + HaplotypeBAMWriter.DEFAULT_HAPLOTYPE_READ_GROUP_ID);
filterList.add("ID:" + HaplotypeBAMWriter.DEFAULT_GATK3_HAPLOTYPE_READ_GROUP_ID);
readFilters.add(new ReadGroupBlackListReadFilter(filterList, null));
return readFilters;
}

@Override
public void onTraversalStart() {
scoreKey = getScoreKeyAndCheckModelAndReadsHarmony();
Expand Down Expand Up @@ -319,7 +329,7 @@ private void transferReadsToPythonViaFifo(final VariantContext variant, final Re
}
Iterator<GATKRead> readIt = readsContext.iterator();
if (!readIt.hasNext()) {
logger.warn("No reads at contig:" + variant.getContig() + "site:" + String.valueOf(variant.getStart()));
logger.warn("No reads at contig:" + variant.getContig() + " site:" + String.valueOf(variant.getStart()));
}
while (readIt.hasNext()) {
sb.append(GATKReadToString(readIt.next()));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ public abstract class HaplotypeBAMWriter implements AutoCloseable {
*/
private long uniqueNameCounter = 1;

private static final String DEFAULT_HAPLOTYPE_READ_GROUP_ID = "ArtificialHaplotypeRG";
public static final String DEFAULT_HAPLOTYPE_READ_GROUP_ID = "ArtificialHaplotypeRG";
// For read filters that need backwards compatibility with the GATK3 artificial haplotype
public static final String DEFAULT_GATK3_HAPLOTYPE_READ_GROUP_ID = "ArtificialHaplotype";
private static final int bestHaplotypeMQ = 60;
private static final int otherMQ = 0;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,14 @@ def score_and_write_batch(args, model, file_out, fifo, batch_size, python_batch_
variant_data = []
read_batch = []

for i in range(batch_size):
for _ in range(batch_size):
fifo_line = fifo.readline()
fifo_data = fifo_line.split(defines.SEPARATOR_CHAR)

variant_data.append(fifo_data[0] + '\t' + fifo_data[1] + '\t' + fifo_data[2] + '\t' + fifo_data[3])
reference_batch.append(reference_string_to_tensor(fifo_data[4]))
annotation_batch.append(annotation_string_to_tensor(args, fifo_data[5]))
variant_types.append(fifo_data[6])
variant_types.append(fifo_data[6].strip())

fidx = 7 # 7 Because above we parsed: contig pos ref alt reference_string annotation variant_type
if args.tensor_name in defines.TENSOR_MAPS_2D and len(fifo_data) > fidx:
Expand All @@ -69,7 +69,7 @@ def score_and_write_batch(args, model, file_out, fifo, batch_size, python_batch_
_, ref_start, _ = get_variant_window(args, var)
insert_dict = get_inserts(args, read_tuples, var)
tensor = read_tuples_to_read_tensor(args, read_tuples, ref_start, insert_dict)
reference_sequence_into_tensor(args, fifo_data[4], tensor)
reference_sequence_into_tensor(args, fifo_data[4], tensor, insert_dict)
if os.path.exists(tensor_dir):
_write_tensor_to_hd5(args, tensor, annotation_batch[-1], fifo_data[0], fifo_data[1], fifo_data[6])
read_batch.append(tensor)
Expand Down Expand Up @@ -201,8 +201,8 @@ def cigar_string_to_tuples(cigar):

def get_variant_window(args, variant):
index_offset = (args.window_size//2)
reference_start = variant.pos-(index_offset+1)
reference_end = variant.pos+index_offset
reference_start = variant.pos-index_offset
reference_end = variant.pos + index_offset + (args.window_size%2)
return index_offset, reference_start, reference_end


Expand Down Expand Up @@ -245,6 +245,7 @@ def read_tuples_to_read_tensor(args, read_tuples, ref_start, insert_dict):

if i == args.window_size:
break

if b == defines.SKIP_CHAR:
continue
elif flag_start == -1:
Expand Down Expand Up @@ -290,8 +291,6 @@ def read_tuples_to_read_tensor(args, read_tuples, ref_start, insert_dict):
else:
tensor[channel_map['mapping_quality'], j, flag_start:flag_end] = mq



return tensor


Expand Down Expand Up @@ -335,9 +334,15 @@ def sequence_and_qualities_from_read(args, read, ref_start, insert_dict):
return rseq, rqual


def reference_sequence_into_tensor(args, reference_seq, tensor):
def reference_sequence_into_tensor(args, reference_seq, tensor, insert_dict):
ref_offset = len(set(args.input_symbols.values()))

for i in sorted(insert_dict.keys(), key=int, reverse=True):
if i < 0:
reference_seq = defines.INDEL_CHAR*insert_dict[i] + reference_seq
else:
reference_seq = reference_seq[:i] + defines.INDEL_CHAR*insert_dict[i] + reference_seq[i:]

for i,b in enumerate(reference_seq):
if i == args.window_size:
break
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -607,11 +607,11 @@ def reference_sequence_into_tensor(args, reference_seq, tensor):
else:
tensor[ref_offset+args.input_symbols[b], :, i] = 1.0
elif b in vqsr_cnn.AMBIGUITY_CODES:
ambiguous_vector = np.transpose(np.tile(vqsr_cnn.AMBIGUITY_CODES[b], (args.read_limit, 1)))
ambiguous_vector = np.tile(vqsr_cnn.AMBIGUITY_CODES[b], (args.read_limit, 1))
if K.image_data_format() == 'channels_last':
tensor[:, i, ref_offset:ref_offset+4] = ambiguous_vector
else:
tensor[ref_offset:ref_offset+4, :, i] = ambiguous_vector
tensor[ref_offset:ref_offset+4, :, i] = np.transpose(ambiguous_vector)


def flag_to_array(flag):
Expand Down
Git LFS file not shown
Git LFS file not shown