Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename clusters #110

Merged
merged 10 commits into from
Jul 28, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions ariba/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,8 @@ def _init_and_run_clusters(self):
counter = 0
cluster_list = []
self.log_files = []
cluster_numbers = [int(x) for x in self.cluster_to_dir.keys()]
cluster_numbers.sort()

for cluster_number in cluster_numbers:
cluster_name = str(cluster_number)
for cluster_name in sorted(self.cluster_to_dir):
counter += 1
if self.verbose:
print('Constructing cluster ', cluster_name, ' (', counter, ' of ', len(self.cluster_to_dir), ')', sep='')
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,7 @@ int assemble(char *readsFile, char *fastaOut, char* logfileOut)
fml_opt_init(&opt);
opt.max_cnt = 10000;
opt.min_asm_ovlp = 15;
opt.mag_opt.flag |= MAG_F_AGGRESSIVE;
std::vector<unsigned short> minCounts;
minCounts.push_back(4);
minCounts.push_back(8);
Expand Down
File renamed without changes.
53 changes: 53 additions & 0 deletions ariba/ref_preparer.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,56 @@ def _write_info_file(self, outfile):
print('genetic_code', self.genetic_code, sep='\t', file=fout)


@staticmethod
def _rename_clusters(clusters_in):
new_clusters = {}
key_count = {}
min_prefix_length = 3

for old_name, name_set in sorted(clusters_in.items()):
names_before_dots = {}
for name in name_set:
if '.' in name:
prefix = name.split('.')[0]
if len(prefix) >= min_prefix_length:
names_before_dots[prefix] = names_before_dots.get(prefix, 0) + 1

if len(names_before_dots) == 0:
new_key = 'cluster'
elif len(names_before_dots) == 1:
new_key = list(names_before_dots.keys())[0]
if sum(list(names_before_dots.values())) < len(name_set):
new_key += '+'
else:
common_prefix = os.path.commonprefix(list(names_before_dots.keys()))
if common_prefix == '' or len(common_prefix) < min_prefix_length:
max_value = max(list(names_before_dots.values()))
possible_keys = [x for x in names_before_dots if names_before_dots[x] == max_value]
possible_keys.sort()
new_key = possible_keys[0] + '+'
else:
new_key = common_prefix + '-'

if new_key in key_count:
if new_key in new_clusters:
assert key_count[new_key] == 1
assert new_key + '_1' not in new_clusters
new_clusters[new_key + '_1'] = new_clusters[new_key]
del new_clusters[new_key]

key_count[new_key] += 1
new_name = new_key + '_' + str(key_count[new_key])
else:
key_count[new_key] = 1
new_name = new_key

assert new_name not in new_clusters
new_clusters[new_name] = name_set


return new_clusters


def run(self, outdir):
original_dir = os.getcwd()

Expand Down Expand Up @@ -101,6 +151,9 @@ def run(self, outdir):
clusters_file=self.clusters_file,
)

clusters = self._rename_clusters(clusters)
reference_data.ReferenceData.write_cluster_allocation_file(clusters, cdhit_outprefix + '.clusters.tsv')

if self.verbose:
print('\nWriting clusters to file.', len(clusters), 'in total', flush=True)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
>contig.0
CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGCTGGTGCCAGAGTTGGACTTAGAGCTACAAACTTTAGCCCTATGATCGCGTCAGAGTTAGTCAAGGGAGGGCCACTCAGGGCATTGGGCCAGGCTAAAGCCGCGAG
CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCTACGACACGGCAGGAGCCTGCTAGGCCCTGAATAAGTGTCAGCTGATGCGGCTAGCGAAGTACCAACCATGCGCTGAGGTTCAAGGCTGTCCAACCTAGATG
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
First line
Fermilite assembly stats:
Min_count Contig_number Mean_length Longest
4 3 230.333 428
4 1 499 499
8 1 499 499
12 1 428 428
16 1 428 428
20 1 384 384
25 1 303 303
30 1 304 304
Best assembly is from min_count 8
25 1 384 384
30 1 373 373
Best assembly is from min_count 4
2 changes: 1 addition & 1 deletion ariba/tests/data/assembly_run_fermilite.expected.fa
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
>contig.0
CATCTAGGTTGGACAGCCTTGAACCTCAGCGCATGGTTGGTACTTCGCTAGCCGCATCAGCTGACACTTATTCAGGGCCTAGCAGGCTCCTGCCGTGTCGTAGGCTAAGCAGGTAGCGCACATATTTCTCTGGGTAAGCGTAACCACGTAAGTTGTAAGAGTAGGGATGAGTCCGGACGATATTACCGACTCAAGCATACGCACCGCCTGAGGACGGGTATCCGGAACACCTATACGCCCTAGGGAGACAGCTGGTCTCCGGCGACTGACATCAGAAAGCGTCAAAGACAATGGGAGTAGAAGGTCAACATATAATCGTCAGAGCACGATGAGACTCTAGTCCCTCCGCATCGTATGTTAACAGGTCTTGTTGGATATCAATTAGAGTTGTAGCTGGTGCCAGAGTTGGACTTAGAGCTACAAACTTTAGCCCTATGATCGCGTCAGAGTTAGTCAAGGGAGGGCCACTCAGGGCATTGGGCCAGGCTAAAGCCGCGAG
CTCGCGGCTTTAGCCTGGCCCAATGCCCTGAGTGGCCCTCCCTTGACTAACTCTGACGCGATCAGAGGGCTAAAGTTTGTAGCTCTAAGTCCAACTCTGGCACCAGCTACAACTCTAATTGATATCCAACAAGACCTGTTAACATACGATGCGGAGGGACTAGAGTCTCATCGTGCTCTGACGATTATATGTTGACCTTCTACTCCCATTGTCTTTGACGCTTTCTGATGTCAGTCGCCGGAGACCAGCTGTCTCCCTAGGGCGTATAGGTGTTCCGGATACCCGTCCTCAGGCGGTGCGTATGCTTGAGTCGGTAATATCGTCCGGACTCATCCCTACTCTTACAACTTACGTGGTTACGCTTACCCAGAGAAATATGTGCGCTACCTGCTTAGCCTACGACACGGCAGGAGCCTGCTAGGCCCTGAATAAGTGTCAGCTGATGCGGCTAGCGAAGTACCAACCATGCGCTGAGGTTCAAGGCTGTCCAACCTAGATG
8 changes: 4 additions & 4 deletions ariba/tests/data/assembly_run_fermilite.expected.log
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
Fermilite assembly stats:
Min_count Contig_number Mean_length Longest
4 3 230.333 428
4 1 499 499
8 1 499 499
12 1 428 428
16 1 428 428
20 1 384 384
25 1 303 303
30 1 304 304
Best assembly is from min_count 8
25 1 384 384
30 1 373 373
Best assembly is from min_count 4
10 changes: 5 additions & 5 deletions ariba/tests/data/ref_preparer_test_run.out/02.cdhit.clusters.tsv
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
0 noncoding1 noncoding2.var_only noncoding3.var_only
1 noncoding4.var_only
2 gene1 gene2
3 gene3
4 gene4.var_only
cluster_1 gene1 gene2
cluster_2 gene3
gene4 gene4.var_only
noncoding- noncoding1 noncoding2.var_only noncoding3.var_only
noncoding4 noncoding4.var_only
33 changes: 33 additions & 0 deletions ariba/tests/ref_preparer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,39 @@


class TestRefPreparer(unittest.TestCase):
def test_rename_clusters(self):
'''test _rename_clusters'''
clusters_in = {
'0': {'no_dot_in_name'},
'1': {'another_no_dot_in_name'},
'2': {'foo.blah_blah_blah', 'foo.xyz'},
'3': {'foo.abc', 'foo.def'},
'4': {'pre1.abc', 'pre2.abc'},
'5': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
'6': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
'7': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
'8': {'xyz.1', 'xyz.2', 'abcdefgh'},
'9': {'a.foo', 'a.bar'},
}

expected = {
'cluster_1': {'no_dot_in_name'},
'cluster_2': {'another_no_dot_in_name'},
'foo_1': {'foo.blah_blah_blah', 'foo.xyz'},
'foo_2': {'foo.abc', 'foo.def'},
'pre-_1': {'pre1.abc', 'pre2.abc'},
'pre-_2': {'pre1.def', 'pre2.pqr', 'pre2.zxy'},
'prefix1+_1': {'prefix1.abc', 'prefix1.def', 'something_else.abc'},
'prefix1+_2': {'prefix1.fgh', 'prefix1.ijk', 'something_else_again.abc'},
'xyz+': {'xyz.1', 'xyz.2', 'abcdefgh'},
'cluster_3': {'a.foo', 'a.bar'},
}

self.maxDiff = None
got = ref_preparer.RefPreparer._rename_clusters(clusters_in)
self.assertEqual(expected, got)


def test_run(self):
'''test run'''
fasta_in = [
Expand Down
2 changes: 1 addition & 1 deletion scripts/ariba
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ input_group.add_argument('-m', '--metadata', action='append', dest='tsv_files',

cdhit_group = subparser_prepareref.add_argument_group('cd-hit options')
cdhit_group.add_argument('--no_cdhit', action='store_true', help='Do not run cd-hit. Each input sequence is put into its own "cluster". Incompatible with --cdhit_clusters.')
cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. First name in line is the cluster representative. Incompatible with --no_cdhit', metavar='FILENAME')
cdhit_group.add_argument('--cdhit_clusters', help='File specifying how the sequences should be clustered. Will be used instead of running cdhit. Format is one cluster per line. Sequence names separated by whitespace. Incompatible with --no_cdhit', metavar='FILENAME')
cdhit_group.add_argument('--cdhit_min_id', type=float, help='Sequence identity threshold (cd-hit option -c) [%(default)s]', default=0.9, metavar='FLOAT')
cdhit_group.add_argument('--cdhit_min_length', type=float, help='length difference cutoff (cd-hit option -s) [%(default)s]', default=0.9, metavar='FLOAT')

Expand Down
20 changes: 15 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from setuptools import setup, find_packages, Extension

minimap_c_files = [
'minimap_ariba.cpp',
'bseq.c',
'index.c',
'kthread.c',
Expand All @@ -15,14 +14,19 @@
'sketch.c',
]

minimap_c_files = [os.path.join('third_party', 'minimap', x) for x in minimap_c_files]
minimap_mod = Extension("minimap_ariba", minimap_c_files, extra_link_args=['-lz'])
minimap_c_files = [os.path.join('third_party', 'minimap-0.2', x) for x in minimap_c_files]
minimap_c_files.append(os.path.join('ariba', 'ext', 'minimap_ariba.cpp'))
minimap_mod = Extension(
"minimap_ariba",
minimap_c_files,
extra_link_args=['-lz'],
include_dirs=[os.path.join('third_party', 'minimap-0.2')],
)

fermilite_c_files = [
'bfc.c',
'bseq.c',
'bubble.c',
'fml-asm_ariba.cpp',
'htab.c',
'ksw.c',
'kthread.c',
Expand All @@ -35,7 +39,13 @@
'unitig.c'
]
fermilite_c_files = [os.path.join('third_party', 'fermi-lite-0.1', x) for x in fermilite_c_files]
fermilite_mod = Extension("fermilite_ariba", fermilite_c_files, extra_link_args=['-lz'])
fermilite_c_files.append(os.path.join('ariba', 'ext', 'fml-asm_ariba.cpp'))
fermilite_mod = Extension(
"fermilite_ariba",
fermilite_c_files,
extra_link_args=['-lz'],
include_dirs=[os.path.join('third_party', 'fermi-lite-0.1')],
)


setup(
Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.