From bcc0d61cf5ac11d304df240fd58c39bd0ee1cd2a Mon Sep 17 00:00:00 2001 From: Brian Ondov Date: Tue, 17 Nov 2015 11:08:10 -0500 Subject: [PATCH] support for billions of references --- src/mash/CommandDistance.cpp | 28 ++++++++++++++-------------- src/mash/CommandDistance.h | 8 ++++---- src/mash/CommandInfo.cpp | 2 +- src/mash/Sketch.cpp | 20 +++++++++++--------- src/mash/Sketch.h | 4 ++-- 5 files changed, 32 insertions(+), 30 deletions(-) diff --git a/src/mash/CommandDistance.cpp b/src/mash/CommandDistance.cpp index 5f59ddd..f4a9f99 100644 --- a/src/mash/CommandDistance.cpp +++ b/src/mash/CommandDistance.cpp @@ -200,7 +200,7 @@ int CommandDistance::run() const sketch.initFromSequence(refArgVector, parameters); - for ( int i = 0; i < sketch.getReferenceCount(); i++ ) + for ( uint64_t i = 0; i < sketch.getReferenceCount(); i++ ) { uint64_t length = sketch.getReference(i).length; @@ -336,16 +336,16 @@ int CommandDistance::run() const void CommandDistance::writeOutput(CompareOutput * output, bool table) const { - int refCount = output->sketchRef.getReferenceCount(); + uint64_t refCount = output->sketchRef.getReferenceCount(); - for ( int i = 0; i < output->sketchQuery->getReferenceCount(); i++ ) + for ( uint64_t i = 0; i < output->sketchQuery->getReferenceCount(); i++ ) { if ( table ) { cout << output->sketchQuery->getReference(i).name; } - for ( int j = 0; j < refCount; j++ ) + for ( uint64_t j = 0; j < refCount; j++ ) { const CompareOutput::PairOutput & pair = output->pairs.at(i * refCount + j); @@ -390,17 +390,17 @@ CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * data) sketchQuery->initFromSequence(fileVector, data->parameters); } - int sketchSize = sketchQuery->getMinHashesPerWindow() < sketchRef.getMinHashesPerWindow() ? + uint64_t sketchSize = sketchQuery->getMinHashesPerWindow() < sketchRef.getMinHashesPerWindow() ? sketchQuery->getMinHashesPerWindow() : sketchRef.getMinHashesPerWindow(); output->pairs.resize(sketchRef.getReferenceCount() * sketchQuery->getReferenceCount()); - for ( int i = 0; i < sketchQuery->getReferenceCount(); i++ ) + for ( uint64_t i = 0; i < sketchQuery->getReferenceCount(); i++ ) { - for ( int j = 0; j < sketchRef.getReferenceCount(); j++ ) + for ( uint64_t j = 0; j < sketchRef.getReferenceCount(); j++ ) { - int pairIndex = i * sketchRef.getReferenceCount() + j; + uint64_t pairIndex = i * sketchRef.getReferenceCount() + j; compareSketches(output->pairs[pairIndex], sketchRef.getReference(j), sketchQuery->getReference(i), sketchSize, sketchRef.getKmerSize(), sketchRef.getKmerSpace(), data->maxDistance, data->maxPValue); } @@ -409,12 +409,12 @@ CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * data) return output; } -void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, int sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue) +void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, uint64_t sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue) { - int i = 0; - int j = 0; - int common = 0; - int denom = 0; + uint64_t i = 0; + uint64_t j = 0; + uint64_t common = 0; + uint64_t denom = 0; const HashList & hashesSortedRef = refRef.hashesSorted; const HashList & hashesSortedQry = refQry.hashesSorted; @@ -495,7 +495,7 @@ void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const output.pass = true; } -double pValue(uint32_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint32_t sketchSize) +double pValue(uint64_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint64_t sketchSize) { if ( x == 0 ) { diff --git a/src/mash/CommandDistance.h b/src/mash/CommandDistance.h index 251a149..0a42f8c 100644 --- a/src/mash/CommandDistance.h +++ b/src/mash/CommandDistance.h @@ -50,8 +50,8 @@ class CommandDistance : public Command struct PairOutput { - int numer; - int denom; + uint64_t numer; + uint64_t denom; double distance; double pValue; bool pass; @@ -73,7 +73,7 @@ class CommandDistance : public Command }; CommandDistance::CompareOutput * compare(CommandDistance::CompareInput * data); -void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, int sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue); -double pValue(uint32_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint32_t sketchSize); +void compareSketches(CommandDistance::CompareOutput::PairOutput & output, const Sketch::Reference & refRef, const Sketch::Reference & refQry, uint64_t sketchSize, int kmerSize, double kmerSpace, double maxDistance, double maxPValue); +double pValue(uint64_t x, uint64_t lengthRef, uint64_t lengthQuery, double kmerSpace, uint64_t sketchSize); #endif diff --git a/src/mash/CommandInfo.cpp b/src/mash/CommandInfo.cpp index 8b8d179..844d8bf 100644 --- a/src/mash/CommandInfo.cpp +++ b/src/mash/CommandInfo.cpp @@ -61,7 +61,7 @@ int CommandInfo::run() const columns[2].push_back("ID"); columns[3].push_back("Comment"); - for ( int i = 0; i < sketch.getReferenceCount(); i++ ) + for ( uint64_t i = 0; i < sketch.getReferenceCount(); i++ ) { const Sketch::Reference & ref = sketch.getReference(i); diff --git a/src/mash/Sketch.cpp b/src/mash/Sketch.cpp index 55ad3bb..f70d6f0 100644 --- a/src/mash/Sketch.cpp +++ b/src/mash/Sketch.cpp @@ -105,11 +105,11 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append) capnp::List::Reader referencesReader = referenceListReader.getReferences(); - int referencesOffset = append ? references.size() : 0; + uint64_t referencesOffset = append ? references.size() : 0; references.resize(referencesOffset + referencesReader.size()); - for ( int i = 0; i < referencesReader.size(); i++ ) + for ( uint64_t i = 0; i < referencesReader.size(); i++ ) { capnp::MinHash::ReferenceList::Reference::Reader referenceReader = referencesReader[i]; @@ -135,7 +135,7 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append) reference.hashesSorted.resize(hashesReader.size()); - for ( int j = 0; j < hashesReader.size(); j++ ) + for ( uint64_t j = 0; j < hashesReader.size(); j++ ) { reference.hashesSorted.set64(j, hashesReader[j]); } @@ -146,7 +146,7 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append) reference.hashesSorted.resize(hashesReader.size()); - for ( int j = 0; j < hashesReader.size(); j++ ) + for ( uint64_t j = 0; j < hashesReader.size(); j++ ) { reference.hashesSorted.set32(j, hashesReader[j]); } @@ -158,14 +158,15 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append) positionHashesByReference.resize(references.size()); - for ( int i = 0; i < lociReader.size(); i++ ) + for ( uint64_t i = 0; i < lociReader.size(); i++ ) { capnp::MinHash::LocusList::Locus::Reader locusReader = lociReader[i]; //cout << locusReader.getHash() << '\t' << locusReader.getSequence() << '\t' << locusReader.getPosition() << endl; positionHashesByReference[locusReader.getSequence() + referencesOffset].push_back(PositionHash(locusReader.getPosition(), locusReader.getHash64())); } - //cout << endl << "References:" << endl << endl; + /* + cout << endl << "References:" << endl << endl; vector< vector > columns(3); @@ -173,15 +174,16 @@ int Sketch::initFromCapnp(const char * file, bool headerOnly, bool append) columns[1].push_back("Length"); columns[2].push_back("Name/Comment"); - for ( int i = 0; i < references.size(); i++ ) + for ( uint64_t i = 0; i < references.size(); i++ ) { columns[0].push_back(to_string(i)); columns[1].push_back(to_string(references[i].length)); columns[2].push_back(references[i].name + " " + references[i].comment); } - //printColumns(columns); - //cout << endl; + printColumns(columns); + cout << endl; + */ /* printf("\nCombined hash table:\n"); diff --git a/src/mash/Sketch.h b/src/mash/Sketch.h index 53dca75..3b4182b 100644 --- a/src/mash/Sketch.h +++ b/src/mash/Sketch.h @@ -125,8 +125,8 @@ class Sketch float getMinHashesPerWindow() const {return parameters.minHashesPerWindow;} int getMinKmerSize(int reference) const; double getRandomKmerChance(int reference) const; - const Reference & getReference(int index) const {return references.at(index);} - int getReferenceCount() const {return references.size();} + const Reference & getReference(uint64_t index) const {return references.at(index);} + uint64_t getReferenceCount() const {return references.size();} int getReferenceIndex(std::string id) const; int getKmerSize() const {return parameters.kmerSize;} double getKmerSpace() const {return kmerSpace;}