Skip to content

Commit

Permalink
Merge pull request #140 from martinghunt/remove_cdhit_2d
Browse files Browse the repository at this point in the history
Remove cdhit 2d
  • Loading branch information
martinghunt authored Sep 30, 2016
2 parents 49e8da3 + 9a053c9 commit 9bdceb8
Show file tree
Hide file tree
Showing 7 changed files with 72 additions and 41 deletions.
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ to the following dependencies.
|----------------|------------------------|---------------------------|
| Bowtie2 | `bowtie2` | `$ARIBA_BOWTIE2` |
| CD-HIT (est) | `cd-hit-est` | `$ARIBA_CDHIT` |
| CD-HIT (est-2d)| `cd-hit-est-2d` | `$ARIBA_CDHIT2D` |
| MASH | `mash` | `$ARIBA_MASH` |


Expand Down
6 changes: 2 additions & 4 deletions ariba/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import shutil
import sys
import pyfastaq
from ariba import assembly, assembly_compare, assembly_variants, external_progs, flag, mapping, read_filter, report, samtools_variants
from ariba import assembly, assembly_compare, assembly_variants, external_progs, flag, mapping, report, samtools_variants

class Error (Exception): pass

Expand Down Expand Up @@ -174,9 +174,7 @@ def _set_up_input_files(self):

self.refdata.write_seqs_to_fasta(self.references_fa, self.reference_names)
self.log_fh = pyfastaq.utils.open_file_write(self.logfile)
self.read_store.get_reads(self.name, self.all_reads1, self.all_reads2)
rfilter = read_filter.ReadFilter(self.read_store, self.references_fa, self.name, self.log_fh, self.extern_progs)
self.total_reads, self.total_reads_bases = rfilter.run(self.all_reads1, self.all_reads2)
self.total_reads, self.total_reads_bases = self.read_store.get_reads(self.name, self.all_reads1, self.all_reads2, log_fh=self.log_fh)
self.refdata.write_seqs_to_fasta(self.references_fa, self.reference_names)

self.longest_ref_length = max([len(self.refdata.sequence(name)) for name in self.reference_names])
Expand Down
51 changes: 43 additions & 8 deletions ariba/ext/minimap_ariba.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ void chooseCluster(std::string outfile, std::map<std::string, uint64_t>& refname
void writeClusterCountsFile(std::string outfile, const std::map<std::string, uint64_t>& readCounters, const std::map<std::string, uint64_t>& baseCounters);
void writeInsertHistogramFile(std::string outfile, const std::map<uint32_t, uint32_t>& insertHist);
void writeProperPairsFile(std::string outfile, uint32_t properPairs);
bool readMappingOk(const mm_reg1_t* r, const mm_idx_t* mi, const kseq_t *ks1, uint32_t endTolerance);

int run_minimap(char *clustersFileIn, char *refFileIn, char *readsFile1In, char *readsFile2In, char *outprefixIn);

Expand Down Expand Up @@ -143,18 +144,24 @@ int run_minimap(char *clustersFileIn, char *refFileIn, char *readsFile1In, char
for (j =0; j < n_reg1; ++j)
{
const mm_reg1_t *r = &reg1[j];
refnames.insert(mi->name[r->rid]);
refnameToScore[mi->name[r->rid]] += r->cnt;
uint32_t coord = r->rev ? std::max(r->rs, r->re) : std::min(r->rs, r->re);
positions1[mi->name[r->rid]].push_back(std::make_pair(coord, r->rev));
if (readMappingOk(r, mi, ks1, (int) 1.1 * k))
{
refnames.insert(mi->name[r->rid]);
refnameToScore[mi->name[r->rid]] += r->cnt;
uint32_t coord = r->rev ? std::max(r->rs, r->re) : std::min(r->rs, r->re);
positions1[mi->name[r->rid]].push_back(std::make_pair(coord, r->rev));
}
}
for (j =0; j < n_reg2; ++j)
{
const mm_reg1_t *r = &reg2[j];
refnames.insert(mi->name[r->rid]);
refnameToScore[mi->name[r->rid]] += r->cnt;
uint32_t coord = r->rev ? std::max(r->rs, r->re) : std::min(r->rs, r->re);
positions2[mi->name[r->rid]].push_back(std::make_pair(coord, r->rev));
if (readMappingOk(r, mi, ks2, (int) 1.1 * k))
{
refnames.insert(mi->name[r->rid]);
refnameToScore[mi->name[r->rid]] += r->cnt;
uint32_t coord = r->rev ? std::max(r->rs, r->re) : std::min(r->rs, r->re);
positions2[mi->name[r->rid]].push_back(std::make_pair(coord, r->rev));
}
}

bool foundProperPair = false;
Expand Down Expand Up @@ -349,3 +356,31 @@ void writeProperPairsFile(std::string outfile, uint32_t properPairs)
ofs << properPairs << '\n';
ofs.close();
}


bool readMappingOk(const mm_reg1_t* r, const mm_idx_t* mi, const kseq_t *ks, uint32_t endTolerance)
{
// coords are same style as python (0-based, end is one past the end)
assert (r->qs < r->qe && r->rs < r->re);

if (r->qe - r->qs < std::min((unsigned) 50, (int) 0.5 * ks->seq.l))
{
return false;
}

uint32_t refLength = mi->len[r->rid];
bool startOk;
bool endOk;
if (r->rev)
{
startOk = (r->qs < endTolerance || refLength - r->re < endTolerance);
endOk = (ks->seq.l - r->qe < endTolerance || r->rs < endTolerance);
}
else
{
startOk = (r->qs < endTolerance || r->rs < endTolerance);
endOk = (ks->seq.l - r->qe < endTolerance || refLength - r->re < endTolerance);
}

return (startOk && endOk);
}
6 changes: 3 additions & 3 deletions ariba/external_progs.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ class Error (Exception): pass
prog_to_default = {
'bowtie2': 'bowtie2',
'cdhit': 'cd-hit-est',
'cdhit2d': 'cd-hit-est-2d',
#'cdhit2d': 'cd-hit-est-2d',
#'gapfiller': 'GapFiller.pl',
'mash': 'mash',
'nucmer' : 'nucmer',
Expand All @@ -27,7 +27,7 @@ class Error (Exception): pass
prog_to_version_cmd = {
'bowtie2': ('--version', re.compile('.*bowtie2.*version (.*)$')),
'cdhit': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
'cdhit2d': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
#'cdhit2d': ('', re.compile('CD-HIT version ([0-9\.]+) \(')),
#'gapfiller': ('', re.compile('^Usage: .*pl \[GapFiller_(.*)\]')),
'mash': ('', re.compile('^Mash version (.*)$')),
'nucmer': ('--version', re.compile('^NUCmer \(NUCleotide MUMmer\) version ([0-9\.]+)')),
Expand All @@ -39,7 +39,7 @@ class Error (Exception): pass
min_versions = {
'bowtie2': '2.1.0',
'cdhit': '4.6',
'cdhit2d': '4.6',
#'cdhit2d': '4.6',
'mash': '1.0.2',
'nucmer': '3.1',
#'spades': '3.5.0',
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
cluster1 1628 123728
cluster2 1952 148352
cluster1 1624 123424
cluster2 1946 147896
37 changes: 16 additions & 21 deletions ariba/tests/data/clusters_minimap_reads_to_all_refs.out.hist
Original file line number Diff line number Diff line change
@@ -1,31 +1,26 @@
191 1
194 3
196 4
201 3
202 3
203 2
204 4
209 8
210 3
194 2
204 1
209 5
210 2
211 1
212 19
213 30
214 43
212 15
213 29
214 42
215 58
216 83
216 80
217 100
218 116
218 114
219 123
220 151
221 170
222 128
220 150
221 168
222 127
223 144
224 108
225 131
224 105
225 128
226 103
227 79
227 78
228 79
229 45
229 44
230 22
231 16
232 1
8 changes: 6 additions & 2 deletions ariba/tests/read_filter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ def setUp(self):
self.external_progs = external_progs.ExternalProgs()


def test_run_cdhit_est_2d(self):
# skip this, as no longer using cdhit2d, but leave it here in case we want
# to put it back in at a later date
def _test_run_cdhit_est_2d(self):
'''test _run_cdhit_est_2d'''
reads_in = os.path.join(data_dir, 'read_filter_test_run_cdhit_est_2d.reads.in.fa')
ref_in = os.path.join(data_dir, 'read_filter_test_run_cdhit_est_2d.ref.in.fa')
Expand All @@ -33,7 +35,9 @@ def test_cdhit_clstr_to_reads(self):
self.assertEqual(expected, got)


def test_run(self):
# skip this, as no longer using cdhit2d, but leave it here in case we want
# to put it back in at a later date
def _test_run(self):
'''test run'''
rstore_infile = os.path.join(data_dir, 'read_filter_test_run.in.read_store')
ref_fasta = os.path.join(data_dir, 'read_filter_test_run.in.ref.fa')
Expand Down

0 comments on commit 9bdceb8

Please sign in to comment.