Skip to content

Commit

Permalink
Merge pull request #110 from martinghunt/rename_clusters
Browse files Browse the repository at this point in the history
Rename clusters
  • Loading branch information
martinghunt authored Jul 28, 2016
2 parents a499fd3 + fe6fce5 commit 1080b37
Show file tree
Hide file tree
Showing 32 changed files with 119 additions and 25 deletions.
5 changes: 1 addition & 4 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,8 @@ def _init_and_run_clusters(self):
counter = 0
cluster_list = []
self.log_files = []
cluster_numbers = [int(x) for x in self.cluster_to_dir.keys()]
cluster_numbers.sort()

for cluster_number in cluster_numbers:
cluster_name = str(cluster_number)
for cluster_name in sorted(self.cluster_to_dir):
counter += 1
if self.verbose:
print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ int assemble(char *readsFile, char *fastaOut, char* logfileOut)
fml_opt_init(&opt);
opt.max_cnt = 10000;
opt.min_asm_ovlp = 15;
opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
std::vector<unsigned short> minCounts;
minCounts.push_back(4);
minCounts.push_back(8);
Expand Down
File renamed without changes.
53 changes: 53 additions & 0 deletions ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,56 @@ def _write_info_file(self, outfile):
print('genetic_code', self.genetic_code, sep='\t', file=fout)


@staticmethod
def _rename_clusters(clusters_in):
new_clusters = {}
key_count = {}
min_prefix_length = 3

for old_name, name_set in sorted(clusters_in.items()):
names_before_dots = {}
for name in name_set:
if '.' in name:
prefix = name.split('.')[0]
if len(prefix) >= min_prefix_length:
names_before_dots[prefix] = names_before_dots.get(prefix, 0) + 1

if len(names_before_dots) == 0:
new_key = 'cluster'
elif len(names_before_dots) == 1:
new_key = list(names_before_dots.keys())[0]
if sum(list(names_before_dots.values())) < len(name_set):
new_key += '+'
else:
common_prefix = os.path.commonprefix(list(names_before_dots.keys()))
if common_prefix == '' or len(common_prefix) < min_prefix_length:
max_value = max(list(names_before_dots.values()))
possible_keys = [x for x in names_before_dots if names_before_dots[x] == max_value]
possible_keys.sort()
new_key = possible_keys[0] + '+'
else:
new_key = common_prefix + '-'

if new_key in key_count:
if new_key in new_clusters:
assert key_count[new_key] == 1
assert new_key + '_1' not in new_clusters
new_clusters[new_key + '_1'] = new_clusters[new_key]
del new_clusters[new_key]

key_count[new_key] += 1
new_name = new_key + '_' + str(key_count[new_key])
else:
key_count[new_key] = 1
new_name = new_key

assert new_name not in new_clusters
new_clusters[new_name] = name_set


return new_clusters


def run(self, outdir):
original_dir = os.getcwd()

Expand Down Expand Up @@ -101,6 +151,9 @@ def run(self, outdir):
clusters_file=self.clusters_file,
)

clusters = self._rename_clusters(clusters)
reference_data.ReferenceData.write_cluster_allocation_file(clusters, cdhit_outprefix + '.clusters.tsv')

if self.verbose:
print('\nWriting clusters to file.', len(clusters), 'in total', flush=True)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
>contig.0
CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGCTGGTGCCAGAGTTGGACTTAGAGCTACAAACTTTAGCCCTATGATCGCGTCAGAGTTAGTCAAGGGAGGGCCACTCAGGGCATTGGGCCAGGCTAAAGCCGCGAG
CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCTACGACACGGCAGGAGCCTGCTAGGCCCTGAATAAGTGTCAGCTGATGCGGCTAGCGAAGTACCAACCATGCGCTGAGGTTCAAGGCTGTCCAACCTAGATG
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
First line
Fermilite assembly stats:
Min_count Contig_number Mean_length Longest
4 3 230.333 428
4 1 499 499
8 1 499 499
12 1 428 428
16 1 428 428
20 1 384 384
25 1 303 303
30 1 304 304
Best assembly is from min_count 8
25 1 384 384
30 1 373 373
Best assembly is from min_count 4
2 changes: 1 addition & 1 deletion ariba/tests/data/assembly_run_fermilite.expected.fa
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
>contig.0
CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGCTGGTGCCAGAGTTGGACTTAGAGCTACAAACTTTAGCCCTATGATCGCGTCAGAGTTAGTCAAGGGAGGGCCACTCAGGGCATTGGGCCAGGCTAAAGCCGCGAG
CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCTACGACACGGCAGGAGCCTGCTAGGCCCTGAATAAGTGTCAGCTGATGCGGCTAGCGAAGTACCAACCATGCGCTGAGGTTCAAGGCTGTCCAACCTAGATG
8 changes: 4 additions & 4 deletions ariba/tests/data/assembly_run_fermilite.expected.log
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Fermilite assembly stats:
Min_count Contig_number Mean_length Longest
4 3 230.333 428
4 1 499 499
8 1 499 499
12 1 428 428
16 1 428 428
20 1 384 384
25 1 303 303
30 1 304 304
Best assembly is from min_count 8
25 1 384 384
30 1 373 373
Best assembly is from min_count 4
10 changes: 5 additions & 5 deletions ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
0 noncoding1 noncoding2.var_only noncoding3.var_only
1 noncoding4.var_only
2 gene1 gene2
3 gene3
4 gene4.var_only
cluster_1 gene1 gene2
cluster_2 gene3
gene4 gene4.var_only
noncoding- noncoding1 noncoding2.var_only noncoding3.var_only
noncoding4 noncoding4.var_only
33 changes: 33 additions & 0 deletions ariba/tests/ref_preparer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,39 @@


class TestRefPreparer(unittest.TestCase):
def test_rename_clusters(self):
'''test _rename_clusters'''
clusters_in = {
'0': {'no_dot_in_name'},
'1': {'another_no_dot_in_name'},
'2': {'foo.blah_blah_blah', 'foo.xyz'},
'3': {'foo.abc', 'foo.def'},
'4': {'pre1.abc', 'pre2.abc'},
'5': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
'6': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
'7': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
'8': {'xyz.1', 'xyz.2', 'abcdefgh'},
'9': {'a.foo', 'a.bar'},
}

expected = {
'cluster_1': {'no_dot_in_name'},
'cluster_2': {'another_no_dot_in_name'},
'foo_1': {'foo.blah_blah_blah', 'foo.xyz'},
'foo_2': {'foo.abc', 'foo.def'},
'pre-_1': {'pre1.abc', 'pre2.abc'},
'pre-_2': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
'prefix1+_1': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
'prefix1+_2': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
'xyz+': {'xyz.1', 'xyz.2', 'abcdefgh'},
'cluster_3': {'a.foo', 'a.bar'},
}

self.maxDiff = None
got = ref_preparer.RefPreparer._rename_clusters(clusters_in)
self.assertEqual(expected, got)


def test_run(self):
'''test run'''
fasta_in = [
Expand Down
2 changes: 1 addition & 1 deletion scripts/ariba
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files',

cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. First name in line is the cluster representative. Incompatible with --no_cdhit', metavar='FILENAME')
cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME')
cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')

Expand Down
20 changes: 15 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from setuptools import setup, find_packages, Extension

minimap_c_files = [
'minimap_ariba.cpp',
'bseq.c',
'index.c',
'kthread.c',
Expand All @@ -15,14 +14,19 @@
'sketch.c',
]

minimap_c_files = [os.path.join('third_party', 'minimap', x) for x in minimap_c_files]
minimap_mod = Extension("minimap_ariba", minimap_c_files, extra_link_args=['-lz'])
minimap_c_files = [os.path.join('third_party', 'minimap-0.2', x) for x in minimap_c_files]
minimap_c_files.append(os.path.join('ariba', 'ext', 'minimap_ariba.cpp'))
minimap_mod = Extension(
"minimap_ariba",
minimap_c_files,
extra_link_args=['-lz'],
include_dirs=[os.path.join('third_party', 'minimap-0.2')],
)

fermilite_c_files = [
'bfc.c',
'bseq.c',
'bubble.c',
'fml-asm_ariba.cpp',
'htab.c',
'ksw.c',
'kthread.c',
Expand All @@ -35,7 +39,13 @@
'unitig.c'
]
fermilite_c_files = [os.path.join('third_party', 'fermi-lite-0.1', x) for x in fermilite_c_files]
fermilite_mod = Extension("fermilite_ariba", fermilite_c_files, extra_link_args=['-lz'])
fermilite_c_files.append(os.path.join('ariba', 'ext', 'fml-asm_ariba.cpp'))
fermilite_mod = Extension(
"fermilite_ariba",
fermilite_c_files,
extra_link_args=['-lz'],
include_dirs=[os.path.join('third_party', 'fermi-lite-0.1')],
)


setup(
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 1080b37

Please sign in to comment.