From e352ffcf9ed6f7d47e6e66e152d01e2fd966d733 Mon Sep 17 00:00:00 2001 From: Vyacheslav Brover Date: Fri, 4 Aug 2023 15:53:54 -0400 Subject: [PATCH] PD-4706 protein match overrides a nucleotide match for point mutations --- alignment.cpp | 1 - amr_report.cpp | 15 ++++--- amrfinder.cpp | 31 +++++++++---- common.cpp | 119 ++++++++++++++++++++++++++++++++++--------------- common.hpp | 47 +++++++++---------- version.txt | 2 +- 6 files changed, 134 insertions(+), 81 deletions(-) diff --git a/alignment.cpp b/alignment.cpp index 5ee49f8..04b24d6 100644 --- a/alignment.cpp +++ b/alignment.cpp @@ -48,7 +48,6 @@ namespace Alignment_sp // AmrMutation - AmrMutation::AmrMutation (size_t pos_arg, const string &geneMutation_arg, const string &class_arg, diff --git a/amr_report.cpp b/amr_report.cpp index 524252b..4743489 100644 --- a/amr_report.cpp +++ b/amr_report.cpp @@ -1041,7 +1041,7 @@ struct BlastAlignment : Alignment const size_t unionStart = min (protStart, dnaStart); const size_t unionStop = max (protStop, dnaStop); if ( ( intersectionStart < intersectionStop - && double (intersectionStop - intersectionStart) / double (protStop - protStart) > 0.75 // PAR + && double (intersectionStop - intersectionStart) / double (protStop - protStart) > 0.75 // PAR, PD-2320 ) || (intersectionStart - unionStart) + (unionStop - intersectionStop) <= 60 * 3 // PAR, PD-4169 || ( protStart <= dnaStart + mismatchTail_aa * 3 @@ -1061,14 +1061,14 @@ struct BlastAlignment : Alignment // Reflexive { if (this == & other) return true; - if ( ! refMutation. empty () - || ! other. refMutation. empty () - ) - return false; if (targetProt == other. targetProt) { if (targetName != other. targetName) return false; + if ( ! refMutation. empty () + || ! other. refMutation. empty () + ) + return false; // PD-807, PD-4277 if ( ! other. insideEq (*this) && ! insideEq (other) @@ -1097,7 +1097,10 @@ struct BlastAlignment : Alignment } } else - { // PD-1902, PD-2139, PD-2313, PD-2320 + { + if (refMutation. empty () != other. refMutation. empty ()) // PD-4698 + return false; + // PD-1902, PD-2139, PD-2313, PD-2320 if (targetProt && ! matchesCds (other)) return false; if (! targetProt && ! other. matchesCds (*this)) diff --git a/amrfinder.cpp b/amrfinder.cpp index 08235fb..384da2c 100644 --- a/amrfinder.cpp +++ b/amrfinder.cpp @@ -33,6 +33,8 @@ * gunzip (optional) * * Release changes: +* 08/04/2023 PD-4698 protein match overrides a nucleotide match for point mutations +* 3.11.18 07/25/2023 parameter order in instruction; "can be gzipped" is added to instruction * 3.11.17 07/19/2023 PD-4687 distinct overlapping hits are not reported separately for protein targets for the same alleles or gene symbols * 3.11.16 07/18/2023 PD-4687 distinct overlapping hits are not reported separately for protein targets (because the start/stop are not reported) * 3.11.15 05/23/2023 PD-4629 "amrfinder_update -d DIR" will create DIR if DIR is missing @@ -298,38 +300,49 @@ struct ThisApplication : ShellApplication { addFlag ("update", "Update the AMRFinder database", 'u'); // PD-2379 addFlag ("force_update", "Force updating the AMRFinder database", 'U'); // PD-3469 + #ifdef DIR addKey ("dir", "Common directory of the --protein, --nucleotide and --gff files", "", '\0', "DIRECTORY"); #endif - addKey ("protein", "Input protein FASTA file", "", 'p', "PROT_FASTA"); - addKey ("nucleotide", "Input nucleotide FASTA file", "", 'n', "NUC_FASTA"); - addKey ("gff", "GFF file for protein locations. Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); - addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); // = --annotation_format pgap + addKey ("protein", "Input protein FASTA file (can be gzipped)", "", 'p', "PROT_FASTA"); + addKey ("nucleotide", "Input nucleotide FASTA file (can be gzipped)", "", 'n', "NUC_FASTA"); + addKey ("gff", "GFF file for protein locations (can be gzipped). Protein id should be in the attribute 'Name=' (9th field) of the rows with type 'CDS' or 'gene' (3rd field).", "", 'g', "GFF_FILE"); addKey ("annotation_format", "Type of GFF file: " + Gff::names. toString (", "), "genbank", 'a', "ANNOTATION_FORMAT"); + addKey ("database", "Alternative directory with AMRFinder database. Default: $AMRFINDER_DB", "", 'd', "DATABASE_DIR"); + addFlag ("database_version", "Print database version", 'V'); + addKey ("ident_min", "Minimum proportion of identical amino acids in alignment for hit (0..1). -1 means use a curated threshold if it exists and " + toString (ident_min_def) + " otherwise", "-1", 'i', "MIN_IDENT"); // "PD-3482 addKey ("coverage_min", "Minimum coverage of the reference protein (0..1)", toString (partial_coverage_min_def), 'c', "MIN_COV"); + addKey ("organism", "Taxonomy group. To see all possible taxonomy groups use the --list_organisms flag", "", 'O', "ORGANISM"); addFlag ("list_organisms", "Print the list of all possible taxonomy groups for mutations identification and exit", 'l'); addKey ("translation_table", "NCBI genetic code for translated BLAST", "11", 't', "TRANSLATION_TABLE"); + addFlag ("plus", "Add the plus genes to the report"); // PD-2789 addFlag ("report_common", "Report proteins common to a taxonomy group"); // PD-2756 - addKey ("mutation_all", "File to report all mutations", "", '\0', "MUT_ALL_FILE"); - addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); - addKey ("hmmer_bin", "Directory for HMMer", "", '\0', "HMMER_DIR"); addFlag ("report_all_equal", "Report all equally-scoring BLAST and HMM matches"); // PD-3772 - addFlag ("print_node", "print hierarchy node (family)"); // PD-4394 addKey ("name", "Text to be added as the first column \"name\" to all rows of the report, for example it can be an assembly name", "", '\0', "NAME"); + addFlag ("print_node", "print hierarchy node (family)"); // PD-4394 + addKey ("mutation_all", "File to report all mutations", "", '\0', "MUT_ALL_FILE"); + addKey ("output", "Write output to OUTPUT_FILE instead of STDOUT", "", 'o', "OUTPUT_FILE"); addKey ("protein_output", "Output protein FASTA file of reported proteins", "", '\0', "PROT_FASTA_OUT"); addKey ("nucleotide_output", "Output nucleotide FASTA file of reported nucleotide sequences", "", '\0', "NUC_FASTA_OUT"); addKey ("nucleotide_flank5_output", "Output nucleotide FASTA file of reported nucleotide sequences with 5' flanking sequences", "", '\0', "NUC_FLANK5_FASTA_OUT"); addKey ("nucleotide_flank5_size", "5' flanking sequence size for NUC_FLANK5_FASTA_OUT", "0", '\0', "NUC_FLANK5_SIZE"); + + addKey ("blast_bin", "Directory for BLAST. Deafult: $BLAST_BIN", "", '\0', "BLAST_DIR"); + addKey ("hmmer_bin", "Directory for HMMer", "", '\0', "HMMER_DIR"); + addFlag ("quiet", "Suppress messages to STDERR", 'q'); + + addFlag ("pgap", "Input files PROT_FASTA, NUC_FASTA and GFF_FILE are created by the NCBI PGAP"); // = --annotation_format pgap addFlag ("gpipe_org", "NCBI internal GPipe organism names"); + addKey ("parm", "amr_report parameters for testing: -nosame -noblast -skip_hmm_check -bed", "", '\0', "PARM"); - addFlag ("database_version", "Print database version", 'V'); + version = SVN_REV; } diff --git a/common.cpp b/common.cpp index d623f45..f6a9533 100644 --- a/common.cpp +++ b/common.cpp @@ -319,8 +319,42 @@ string getStack () bool Chronometer::enabled = false; + + +void Chronometer::start () +{ + if (! on ()) + return; + if (startTime != noclock) + throw runtime_error (FUNC "Chronometer \"" + name + "\" is not stopped"); + startTime = clock (); +} + + + +void Chronometer::stop () +{ + if (! on ()) + return; + if (startTime == noclock) + throw runtime_error (FUNC "Chronometer \"" + name + "\" is not started"); + time += clock () - startTime; + startTime = noclock; +} + + +void Chronometer::print (ostream &os) const +{ + if (! on ()) + return; + os << "CHRON: " << name << ": "; + const ONumber onm (os, 2, false); + os << (double) time / CLOCKS_PER_SEC << " sec." << endl; +} + + // Chronometer_OnePass Chronometer_OnePass::Chronometer_OnePass (const string &name_arg, @@ -439,6 +473,26 @@ const string noString; +string nonPrintable2str (char c) +{ + string s; + switch (c) + { + case 7: s = "BEL"; break; + case 8: s = "BS"; break; + case 9: s = "TAB"; break; + case 10: s = "LF"; break; + case 12: s = "FF"; break; + case 13: s = "CR"; break; + case 27: s = "ESC"; break; + case 32: s = "SPACE"; break; + default: s = "x" + uchar2hex ((uchar) c); + } + return "<" + s + ">"; +} + + + bool isRight (const string &s, const string &right) { @@ -2458,18 +2512,9 @@ Token TokenInput::getXmlText () } // t.name - if (n >= 0 && n < ' ') - { - string s; - switch (n) - { - case 9: s = "TAB"; break; - case 10: s = "LF"; break; - case 13: s = "CR"; break; - default: s = "x" + uchar2hex ((uchar) n); - } - t. name += "<" + s + ">"; - } + QC_ASSERT (n > 0); + if (nonPrintable (n)) + t. name += nonPrintable2str ((char) n); else { if (isChar (n)) @@ -3363,10 +3408,10 @@ void Application::addKey (const string &name, ASSERT (! contains (name2arg, name)); if (acronym && contains (char2arg, acronym)) throw logic_error ("Duplicate option " + strQuote (string (1, acronym))); - keys << Key (*this, name, argDescription, defaultValue, acronym, var); - name2arg [name] = & keys. back (); + keyArgs << Key (*this, name, argDescription, defaultValue, acronym, var); + name2arg [name] = & keyArgs. back (); if (acronym) - char2arg [acronym] = & keys. back (); + char2arg [acronym] = & keyArgs. back (); } @@ -3379,10 +3424,10 @@ void Application::addFlag (const string &name, ASSERT (! contains (name2arg, name)); if (acronym && contains (char2arg, acronym)) throw logic_error ("Duplicate option " + strQuote (string (1, acronym))); - keys << Key (*this, name, argDescription, acronym); - name2arg [name] = & keys. back (); + keyArgs << Key (*this, name, argDescription, acronym); + name2arg [name] = & keyArgs. back (); if (acronym) - char2arg [acronym] = & keys. back (); + char2arg [acronym] = & keyArgs. back (); } @@ -3392,8 +3437,8 @@ void Application::addPositional (const string &name, { ASSERT (! contains (name2arg, name)); IMPLY (gnu, isUpper (name)); - positionals << Positional (name, argDescription); - name2arg [name] = & positionals. back (); + positionalArgs << Positional (name, argDescription); + name2arg [name] = & positionalArgs. back (); } @@ -3417,7 +3462,7 @@ Application::Key* Application::getKey (const string &keyName) const void Application::setPositional (List::iterator &posIt, const string &value) { - if (posIt == positionals. end ()) + if (posIt == positionalArgs. end ()) { if (isLeft (value, "-")) errorExitStr (strQuote (value) + " is not a valid option\n\n" + getInstruction ()); @@ -3472,14 +3517,14 @@ void Application::qc () const QC_ASSERT (! description. empty ()); QC_ASSERT (! version. empty ()); - QC_IMPLY (! needsArg, positionals. empty ()); + QC_IMPLY (! needsArg, positionalArgs. empty ()); - for (const Positional& p : positionals) + for (const Positional& p : positionalArgs) p. qc (); map requiredGroups; string requiredGroup_prev; - for (const Key& key : keys) + for (const Key& key : keyArgs) { key. qc (); if (! key. requiredGroup. empty () && key. requiredGroup != requiredGroup_prev) @@ -3534,11 +3579,11 @@ string Application::getInstruction () const instr += "\n\nUSAGE: " + programName; - for (const Positional& p : positionals) + for (const Positional& p : positionalArgs) instr += " " + p. str (); string requiredGroup_prev; - for (const Key& key : keys) + for (const Key& key : keyArgs) { if (key. requiredGroup == requiredGroup_prev) if (key. requiredGroup. empty ()) @@ -3580,17 +3625,17 @@ string Application::getHelp () const const string par ("\n "); - if (! positionals. empty ()) + if (! positionalArgs. empty ()) { instr += "\n\nPOSITIONAL PARAMETERS:"; - for (const Positional& p : positionals) + for (const Positional& p : positionalArgs) instr += "\n" + p. str () + par + p. description; } - if (! keys. empty ()) + if (! keyArgs. empty ()) { instr += "\n\nNAMED PARAMETERS:"; - for (const Key& key : keys) + for (const Key& key : keyArgs) { instr += "\n" + key. getShortHelp () + par + key. description; if (gnu && ! key. defaultValue. empty ()) @@ -3644,11 +3689,11 @@ int Application::run (int argc, initEnvironment (); - // positionals, keys + // positionalArgs, keyArgs Set keysRead; { bool first = true; - List::iterator posIt = positionals. begin (); + List::iterator posIt = positionalArgs. begin (); Key* key = nullptr; for (string s : programArgs) { @@ -3733,19 +3778,19 @@ int Application::run (int argc, errorExitStr ("Key with no value: " + key->name + "\n\n" + getInstruction ()); - if (programArgs. size () == 1 && (! positionals. empty () || needsArg)) + if (programArgs. size () == 1 && (! positionalArgs. empty () || needsArg)) { cout << getInstruction () << endl; return 1; } - if (posIt != positionals. end ()) + if (posIt != positionalArgs. end ()) errorExitStr ("Too few positional parameters\n\n" + getInstruction ()); } - for (Key& key : keys) + for (Key& key : keyArgs) if (! keysRead. contains (& key)) key. value = key. defaultValue; @@ -3753,7 +3798,7 @@ int Application::run (int argc, { map requiredGroup2names; map requiredGroup2given; - for (const Key& key : keys) + for (const Key& key : keyArgs) if (! key. requiredGroup. empty ()) { if (! key. value. empty ()) @@ -3885,7 +3930,7 @@ void ShellApplication::initEnvironment () string execDir_ (execDir); trimSuffix (execDir_, "/"); - for (Key& key : keys) + for (Key& key : keyArgs) if (! key. flag) replaceStr (key. defaultValue, "$BASE", execDir_); } diff --git a/common.hpp b/common.hpp index 3d11a53..69913bd 100644 --- a/common.hpp +++ b/common.hpp @@ -300,6 +300,14 @@ class OColor +template + inline void report (ostream &os, + const string &name, + T value) + { os << name << '\t' << value << endl; } + + + inline void section (const string &title, bool useAlsoCout) { const Color::Type color = Color::cyan; @@ -344,29 +352,9 @@ struct Chronometer : Nocopy bool on () const { return enabled && threads_max == 1; } - void start () - { if (! on ()) - return; - if (startTime != noclock) - throwf ("Chronometer \"" + name + "\" is not stopped"); - startTime = clock (); - } - void stop () - { if (! on ()) - return; - if (startTime == noclock) - throwf ("Chronometer \"" + name + "\" is not started"); - time += clock () - startTime; - startTime = noclock; - } - - void print (ostream &os) const - { if (! on ()) - return; - os << "CHRON: " << name << ": "; - const ONumber onm (os, 2, false); - os << (double) time / CLOCKS_PER_SEC << " sec." << endl; - } + void start (); + void stop (); + void print (ostream &os) const; }; @@ -580,6 +568,11 @@ inline bool isLower (char c) inline bool printable (char c) { return between (c, ' ', (char) 127); } +inline bool nonPrintable (long c) + { return between (c, '\x001', ' '); } + +string nonPrintable2str (char c); + inline bool isDelimiter (char c) { return c != ' ' && printable (c) @@ -4337,8 +4330,8 @@ struct Application : Singleton, Root const Key* asKey () const final { return this; } }; - List positionals; - List keys; + List positionalArgs; + List keyArgs; map name2arg; map char2arg; // Valid if gnu @@ -4394,7 +4387,7 @@ struct Application : Singleton, Root protected: string getArg (const string &name) const; - // Input: keys, where Key::flag = false, and positionals + // Input: keyArgs, where Key::flag = false, and positionalArgs uint arg2uint (const string &name) const { uint n = 0; if (! str2 (getArg (name), n)) @@ -4408,7 +4401,7 @@ struct Application : Singleton, Root return d; } bool getFlag (const string &name) const; - // Input: keys, where Key::flag = true + // Input: keyArgs, where Key::flag = true string key2shortHelp (const string &name) const; string getProgramDirName () const { return getDirName (programArgs. front ()); } diff --git a/version.txt b/version.txt index 8910aa0..4e4afc1 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -3.11.17 +3.11.18