Skip to content

Commit

Permalink
Improved al code
Browse files Browse the repository at this point in the history
  • Loading branch information
jfmrod committed Jan 12, 2015
1 parent 8de0c2d commit ff8a369
Show file tree
Hide file tree
Showing 16 changed files with 612 additions and 300 deletions.
652 changes: 420 additions & 232 deletions Makefile

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ if HAVE_MPILIB
MPIBIN=hpc-clust-mpi
endif

TESTS=tests/single-sl.sh tests/mpi-sl.sh
TESTS=tests/single-al.sh tests/single-sl.sh tests/mpi-sl.sh


#man_MANS=man/hpc-clust.1 man/hpc-clust-mpi.1
Expand Down
6 changes: 5 additions & 1 deletion README
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

HPC-CLUST v1.1.0 (5 June 2014)
HPC-CLUST v1.1.1 (21 October 2014)
by Joao F. Matias Rodrigues and Christian von Mering
Institute of Molecular Life Sciences, University of Zurich, Switzerland
Matias Rodrigues JF, Mering C von. HPC-CLUST: Distributed hierarchical clustering for very large sets of nucleotide sequences. Bioinformatics. 2013.
Expand Down Expand Up @@ -220,6 +220,10 @@ Sebastian Schmidt
=================================================
6. HISTORY

1.1.1 (21 October 2014)
- Fixed bug in make-otus.sh when using fasta files
- Added make-otus-mothur.sh to create otu list files in mothur format

1.1.0 (5 June 2014)
- Added test suite
- Fixed bug in mpi version introducted after changing to long indices
Expand Down
4 changes: 2 additions & 2 deletions configure.in
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
AC_INIT(hpc-clust,1.1.0,[email protected])
AC_INIT(hpc-clust,1.2.0,[email protected])

: ${CXXFLAGS=-O3}

Expand Down Expand Up @@ -40,7 +40,7 @@ case "${target_os}" in
esac


AM_INIT_AUTOMAKE("hpc-clust","1.1.0")
AM_INIT_AUTOMAKE("hpc-clust","1.1.1")

#AC_CHECK_LIB(z,main)

Expand Down
100 changes: 81 additions & 19 deletions eseqclusteravg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,19 @@ void eseqclusteravg::check(ebasicarray<eseqdistCount>& dists)
cout << "# no duplicates found!" << endl;
}

void eseqclusteravg::init(INDTYPE count,const estr& filename,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist)
void eseqclusteravg::init(INDTYPE count,const estr& filename,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist,float _thres,float (_fdist)(const estr&,const estr&,int),estrarray& _seqarr,int _seqlen)
{
thres=_thres;
seqarr=&_seqarr;
seqlen=_seqlen;
fdist=_fdist;
ofile.open(filename,"w");
ofile.write("# seqsfile: "+seqsfile+"\n");
ofile.write("# OTU_count Merge_distance Merged_OTU_id1 Merged_OTU_id2\n");
long i,j;
incmaxdist=0.0;
incmaxdist=1.0;
incmaxit=smatrix.end();
cf=0;
lastdist=0.0;
scount.reserve(count);
scluster.reserve(count);
Expand Down Expand Up @@ -231,17 +237,23 @@ void eseqclusteravg::merge(const eseqdistCount& sdist)
if (scount[tmpit->x]*scount[tmpit->y]==tmpit->count){
// ldieif(tmpit->dist>sdist.dist,"sdist.dist: "+estr(sdist.dist)+" tmpit.dist: "+tmpit->dist+" tmpit.count: "+tmpit->count);
completemerges.insert(*tmpit);
if (tmpit == incmaxit)
incmaxit=smatrix.end();
}
if (tmpit2==incmaxit)
incmaxit=smatrix.end();
smatrix.erase(tmpit2);
}
++mergecount;
}

float eseqclusteravg::getIncompleteMaxDist(float newdist)
//float eseqclusteravg::getIncompleteMaxDist(float newdist)
void eseqclusteravg::getIncompleteMaxDist(float newdist,float &maxdist,eseqdistavghash::iter& maxit)
{
float maxdist=0.0;
if (smatrix.size()==0){ maxit=smatrix.end(); maxdist=newdist; return; }
maxdist=0.0;
float tmpdist;
eseqdistavghash::iter it,maxit;
eseqdistavghash::iter it;
maxit=smatrix.end();
for (it=smatrix.begin(); it!=smatrix.end(); ++it){
// lassert(scount[it->x]==0 || scount[it->y]==0);
Expand All @@ -251,10 +263,8 @@ float eseqclusteravg::getIncompleteMaxDist(float newdist)
if (tmpdist>maxdist) { maxdist=tmpdist; maxit=it; }
}
}
if (maxit!=smatrix.end()){
if (maxit!=smatrix.end())
cout << "# maxit: " << *maxit << " scount[x]: " << scount[maxit->x] << " scount[y]: "<< scount[maxit->y] << " nextthres: " << (completemerges.size()?estr( ((double)scount[maxit->x]*scount[maxit->y]*completemerges.begin()->dist-(double)maxit->count*maxit->dist)/(double)(scount[maxit->x]*scount[maxit->y]-maxit->count)):estr("n/a")) << endl;
}
return(maxdist);
}

void eseqclusteravg::clearComplete()
Expand All @@ -273,30 +283,74 @@ void eseqclusteravg::mergeComplete(float dist)
{
multiset<eseqdistCount>::iterator it,it_next;
eseqdistavghash::iter sit;
for (it=completemerges.begin(); it!=completemerges.end() && it->dist>=dist; it=completemerges.begin()){
for (it=completemerges.begin(); it!=completemerges.end() && incmaxit != smatrix.end() && it->dist>=incmaxdist || incmaxit==smatrix.end() && it->dist>=dist; it=completemerges.begin()){
sit=smatrix.get(*it);
if (sit==smatrix.end() || sit->dist!=it->dist || scount[sit->x]==0 || scount[sit->y]==0 || scount[sit->x]*scount[sit->y]!=sit->count) { completemerges.erase(it); continue; }
if (sit->dist<thres) return; // do not merge past threshold (possible missing links)

lassert(scluster[sit->x]!=sit->x || scluster[sit->y]!=sit->y);
cout << "* " << scluster.size()-mergecount << " " << sit->dist << " ("<<sit->dist<<") " << sit->x << " " << scount[sit->x]<< " " << sit->y << " " << scount[sit->y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl;
ofile.write(estr(scluster.size()-mergecount)+" "+sit->dist+" "+sit->x+" "+sit->y+"\n");
merge(*sit);
ofile.write(estr(scluster.size()-mergecount)+" "+sit->dist+" "+sit->x+" "+sit->y+"\n");
smatrix.erase(sit);
completemerges.erase(it);
if (incmaxit==smatrix.end())
getIncompleteMaxDist(dist,incmaxdist,incmaxit);
}
ofile.flush();
}

void eseqclusteravg::finalize()
{
while (incmaxit!=smatrix.end()){
while (completemerges.size() && (incmaxit!=smatrix.end() && completemerges.begin()->dist>=incmaxdist || incmaxit==smatrix.end() && completemerges.begin()->dist>=lastdist)){
if (completemerges.begin()->dist<thres) return;
mergeComplete(lastdist);
}
double avgdist=0.0;
std::list<long> &arr(incluster[incmaxit->x]);
std::list<long> &arr2(incluster[incmaxit->y]);
for (std::list<long>::iterator i=arr.begin(); i!=arr.end(); ++i){
for (std::list<long>::iterator j=arr2.begin(); j!=arr2.end(); ++j){
// cout << "dist: " << *i << "," << *j << ": " << fdist(*i,*j) << endl;
avgdist+=fdist((*seqarr).values(*i),(*seqarr).values(*j),seqlen);
}
}
avgdist=avgdist/(arr.size()*arr2.size());
incmaxit->count=arr.size()*arr2.size();
incmaxit->dist=avgdist;
cout << "# calculating avg cluster distance: " << incmaxit->x << "," << incmaxit->y << " : " << incmaxdist << " : " << avgdist << endl;
completemerges.insert(*incmaxit);
getIncompleteMaxDist(lastdist,incmaxdist,incmaxit);
}
}


void eseqclusteravg::add(const eseqdist& sdist){
ldieif(sdist.x<0 || sdist.y<0 || sdist.x>=scluster.size() || sdist.y>=scluster.size(),"out of bounds: sdist.x: "+estr(sdist.x)+" sdist.y: "+estr(sdist.y)+" scluster.size(): "+estr(scluster.size()));

if (lastdist!=sdist.dist){
incmaxdist=getIncompleteMaxDist(sdist.dist);
if (lastdist != sdist.dist){
if (incmaxit!=smatrix.end())
incmaxdist=((double)incmaxit->count*incmaxit->dist+(double)(scount[incmaxit->x]*scount[incmaxit->y]-incmaxit->count)*sdist.dist)/(double)(scount[incmaxit->x]*scount[incmaxit->y]);
else
getIncompleteMaxDist(sdist.dist,incmaxdist,incmaxit);
}

if (completemerges.size()>0l && completemerges.begin()->dist>=incmaxdist){
cout << "# trying merge: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl;
cout << "# merging: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl;
long tmpmc=mergecount;
while (completemerges.size() && completemerges.begin()->dist>=incmaxdist){
mergeComplete(incmaxdist);
incmaxdist=getIncompleteMaxDist(sdist.dist);
mergeComplete(sdist.dist);
}
clearComplete();
lastdist=sdist.dist;
if (tmpmc!=mergecount)
clearComplete();
cf=completemerges.size()/100000;
cout << "# after merge: smatrix: " << smatrix.size() << " completemerges: " << completemerges.size() << " cf: " << cf << " dist: " << sdist.dist << " incmaxdist: " << incmaxdist << " topdist: " << completemerges.begin()->dist << " " << mergecount << endl;
++cf;
}
lastdist=sdist.dist;

eseqdistCount tmpdist;
tmpdist.x=scluster[sdist.x];
Expand All @@ -321,9 +375,10 @@ void eseqclusteravg::add(const eseqdist& sdist){
if (it==smatrix.end()){
if (scount[tmpdist.x]*scount[tmpdist.y]==tmpdist.count){
if (tmpdist.dist>=incmaxdist){
cout << "1 " << scluster.size()-mergecount << " " << tmpdist.dist << " ("<<tmpdist.dist<<") " << tmpdist.x << " " << scount[tmpdist.x]<< " " << tmpdist.y << " " << scount[tmpdist.y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl;
ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
// cout << "1 " << scluster.size()-mergecount << " " << tmpdist.dist << " ("<<tmpdist.dist<<") " << tmpdist.x << " " << scount[tmpdist.x]<< " " << tmpdist.y << " " << scount[tmpdist.y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl;
merge(tmpdist);
ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
ofile.flush();
}else{
smatrix.add(tmpdist,tmpdist);
inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x);
Expand All @@ -332,6 +387,8 @@ void eseqclusteravg::add(const eseqdist& sdist){
}else{
smatrix.add(tmpdist,tmpdist);
inter[tmpdist.x].push_back(tmpdist.y); inter[tmpdist.y].push_back(tmpdist.x);
if (incmaxit==smatrix.end())
incmaxit=smatrix.get(tmpdist);
}
return;
}
Expand All @@ -342,9 +399,10 @@ void eseqclusteravg::add(const eseqdist& sdist){
// complete linkage
if (it->count==scount[tmpdist.x]*scount[tmpdist.y]){
if (it->dist>=incmaxdist){
cout << "+ " << scluster.size()-mergecount << " " << it->dist << " ("<<tmpdist.dist<<") " << it->x << " " << scount[it->x]<< " " << it->y << " " << scount[it->y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl;
ofile.write(estr(scluster.size()-mergecount)+" "+it->dist+" "+it->x+" "+it->y+"\n");
// cout << "+ " << scluster.size()-mergecount << " " << it->dist << " ("<<tmpdist.dist<<") " << it->x << " " << scount[it->x]<< " " << it->y << " " << scount[it->y] << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << " " << (completemerges.size()?estr(completemerges.begin()->dist):estr("n/a")) << endl;
merge(*it);
ofile.write(estr(scluster.size()-mergecount)+" "+it->dist+" "+it->x+" "+it->y+"\n");
ofile.flush();
smatrix.erase(it);
// incmaxdist=getIncompleteMaxDist(sdist.dist);
// while (completemerges.size() && completemerges.begin()->dist>=incmaxdist){
Expand All @@ -355,6 +413,8 @@ void eseqclusteravg::add(const eseqdist& sdist){
completemerges.insert(*it);
// cout << "# " << scluster.size()-mergecount << " " << tmpdist.dist << " " << tmpdist.x << " " << tmpdist.y << " " << smatrix.size() << " " << completemerges.size() << " " << incmaxdist << endl;
}
if (it == incmaxit || incmaxit==smatrix.end())
getIncompleteMaxDist(sdist.dist,incmaxdist,incmaxit);
}
}

Expand Down Expand Up @@ -388,6 +448,7 @@ void eseqclusteravg::add(const eseqdistCount& sdist){
if (scount[tmpdist.x]*scount[tmpdist.y]==sdist.count){
merge(tmpdist);
ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
ofile.flush();
// cout << scluster.size()-mergecount << " " << sdist.dist << " " << sdist.x << " " << sdist.y << endl;
}else{
smatrix.add(tmpdist,tmpdist);
Expand All @@ -405,6 +466,7 @@ void eseqclusteravg::add(const eseqdistCount& sdist){
merge(tmpdist);
// update(ind-1,x,y);
ofile.write(estr(scluster.size()-mergecount)+" "+tmpdist.dist+" "+tmpdist.x+" "+tmpdist.y+"\n");
ofile.flush();
// cout << scluster.size()-mergecount << " " << tmpdist.dist << " " << tmpdist.x << " " << tmpdist.y << endl;
// sleep(1);
// cout << sdist.dist << " " << x << " " << y << endl;
Expand Down
15 changes: 13 additions & 2 deletions eseqclusteravg.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,15 @@ class eseqclusteravg
{
public:
INDTYPE mergecount;
INDTYPE cf;

float thres;
int seqlen;
estrarray *seqarr;
float (*fdist)(const estr&,const estr&,int);

efile ofile;
eseqdistavghash::iter incmaxit;
float incmaxdist;
float lastdist;

Expand All @@ -38,7 +46,9 @@ class eseqclusteravg

eseqclusteravg();

void init(INDTYPE count,const estr& ofile,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist);
void init(INDTYPE count,const estr& ofile,const estr& seqsfile,const earray<ebasicarray<INDTYPE> >& dupslist,float thres,float (fdist)(const estr&,const estr&,int),estrarray& arr,int seqlen);

void finalize();

void merge(const eseqdistCount& dist);

Expand All @@ -48,7 +58,8 @@ class eseqclusteravg

long update(eblockarray<eseqdistCount>& dists,long s);

float getIncompleteMaxDist(float newdist);
// float getIncompleteMaxDist(float newdist);
void getIncompleteMaxDist(float newdist,float &maxdist,eseqdistavghash::iter& maxit);
void mergeComplete(float dist);
void clearComplete();

Expand Down
16 changes: 8 additions & 8 deletions eutils/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ CPPFLAGS =
CXX = g++
CXXCPP = g++ -E
CXXDEPMODE = depmode=gcc3
CXXFLAGS = -O2 -pthread -I/opt/openmpi/include -pthread
CXXFLAGS = -O2 -pthread -I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi -pthread
CYGPATH_W = echo
DEFS = -DHAVE_CONFIG_H
DEPDIR = .deps
Expand All @@ -199,9 +199,9 @@ ECHO_C =
ECHO_N = -n
ECHO_T =
EGREP = /bin/grep -E
EUTILS_CXXFLAGS = -I${prefix}/include -O2 -pthread -I/opt/openmpi/include -pthread
EUTILS_LDFLAGS = -pthread -L/opt/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl
EUTILS_LIBS = eutils/libeutils.a -lmpi_cxx -lrt -lmpi
EUTILS_CXXFLAGS = -I${prefix}/include -O2 -pthread -I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi -pthread
EUTILS_LDFLAGS = -pthread -L/usr/lib/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl
EUTILS_LIBS = eutils/libeutils.a -lmpi++ -lmpi_cxx -lrt -lmpi
EXEEXT =
FGREP = /bin/grep -F
GREP = /bin/grep
Expand All @@ -211,9 +211,9 @@ INSTALL_PROGRAM = ${INSTALL}
INSTALL_SCRIPT = ${INSTALL}
INSTALL_STRIP_PROGRAM = $(install_sh) -c -s
LD = /usr/bin/ld -m elf_x86_64
LDFLAGS = -pthread -L/opt/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl
LDFLAGS = -pthread -L/usr/lib/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl
LIBOBJS =
LIBS = -lmpi_cxx -lrt -lmpi
LIBS = -lmpi++ -lmpi_cxx -lrt -lmpi
LIBTOOL = $(SHELL) $(top_builddir)/libtool
LIPO =
LN_S = ln -s
Expand Down Expand Up @@ -245,7 +245,7 @@ PTHREAD_LIBS =
RANLIB = ranlib
SED = /bin/sed
SET_MAKE =
SHELL = /bin/sh
SHELL = /bin/bash
STRIP = strip
VERSION = 1.0.0
abs_builddir = /home/jfmrod/erisdb/work/hpc-clust/eutils
Expand Down Expand Up @@ -291,7 +291,7 @@ mandir = ${datarootdir}/man
mkdir_p = /bin/mkdir -p
oldincludedir = /usr/include
pdfdir = ${docdir}
prefix = /home/jfmrod/usr
prefix = /home/jfmrod/gaia/usr
program_transform_name = s,x,x,
psdir = ${docdir}
sbindir = ${exec_prefix}/sbin
Expand Down
6 changes: 3 additions & 3 deletions eutils/eutils-config
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#! /bin/sh

prefix=/home/jfmrod/usr
prefix=/home/jfmrod/gaia/usr
exec_prefix=${prefix}
includedir=${prefix}/include

Expand Down Expand Up @@ -54,11 +54,11 @@ while test $# -gt 0; do
;;

--cxxflags)
echo -I${prefix}/include -O2 -pthread -I/opt/openmpi/include -pthread
echo -I${prefix}/include -O2 -pthread -I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi -pthread
;;

--libs)
echo -pthread -L/opt/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl eutils/libeutils.a -lmpi_cxx -lrt -lmpi
echo -pthread -L/usr/lib/openmpi/lib -lmpi_cxx -lmpi -lopen-rte -lopen-pal -ldl -Wl,--export-dynamic -lnsl -lutil -lm -ldl eutils/libeutils.a -lmpi++ -lmpi_cxx -lrt -lmpi
;;

*)
Expand Down
4 changes: 3 additions & 1 deletion eutils/eutils-config.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@
#endif

/* Define to 1 if you have the `mpi++' library (-lmpi++). */
/* #undef HAVE_LIBMPI__ */
#ifndef EUTILS_HAVE_LIBMPI__
#define EUTILS_HAVE_LIBMPI__ 1
#endif

/* Define to 1 if you have the `rt' library (-lrt). */
#ifndef EUTILS_HAVE_LIBRT
Expand Down
6 changes: 3 additions & 3 deletions eutils/eutils.pc
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
prefix=/home/jfmrod/usr
prefix=/home/jfmrod/gaia/usr
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include

Name: easycpp.eutils
Description: The easy programming library
Version: 1.0.0
Libs: eutils/libeutils.a -lmpi_cxx -lrt -lmpi -lmpi_cxx -lrt -lmpi
Cflags: -I${prefix}/include -O2 -pthread -I/opt/openmpi/include -pthread
Libs: eutils/libeutils.a -lmpi++ -lmpi_cxx -lrt -lmpi -lmpi++ -lmpi_cxx -lrt -lmpi
Cflags: -I${prefix}/include -O2 -pthread -I/usr/lib/openmpi/include -I/usr/lib/openmpi/include/openmpi -pthread
Loading

0 comments on commit ff8a369

Please sign in to comment.