Skip to content

Commit

Permalink
Merge pull request #13 from althonos/impl-nhmmer
Browse files Browse the repository at this point in the history
Implement long targets pipeline for consistent `nhmmer` results
  • Loading branch information
althonos authored Nov 11, 2021
2 parents eb6fe7c + 3d79ab4 commit 704fd88
Show file tree
Hide file tree
Showing 26 changed files with 64,223 additions and 210 deletions.
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,18 @@ Module.symvers
Mkfile.old
dkms.conf

# Valgrind files
callgrind.out.*

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.c
*.html
pyhmmer/*.c
pyhmmer/*.html

# Cython
cython_debug/
Expand Down
3 changes: 1 addition & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

### Changed
- `plan7.Cutoffs` now support setting the bit score cutoffs, but requires both to be set or cleared at the same time.
- `easel.Vector` will always allocate some memory when
created manually to avoid having a special empty case in every vector method.
- `easel.Vector` will always allocate some memory when created manually to avoid having a special empty case in every vector method.
- `pyhmmer.easel.AllocationError` now stores the size it failed to allocate, and the number of elements when allocating an array.

### Fixed
Expand Down
24 changes: 24 additions & 0 deletions include/libhmmer/fm.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t, int64_t


cdef extern from "hmmer.h" nogil:

cdef struct fm_data_s:
uint64_t N
uint32_t term_loc
uint32_t seq_offset
uint32_t ambig_offset
uint32_t seq_cnt
uint32_t ambig_cnt
uint32_t overlap
uint8_t* T
uint8_t* BWT_mem
uint8_t* BWT
uint32_t* SA
int64_t* C
uint32_t* occCnts_sb
uint16_t* occCnts_b
ctypedef fm_data_s FM_DATA

ctypedef struct FM_CFG:
pass
7 changes: 3 additions & 4 deletions include/libhmmer/p7_alidisplay.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ from libeasel.alphabet cimport ESL_ALPHABET
from libeasel.random cimport ESL_RANDOMNESS
from libeasel.sq cimport ESL_SQ
from libhmmer.p7_trace cimport P7_TRACE
from libhmmer.p7_pipeline cimport P7_PIPELINE

IF HMMER_IMPL == "VMX":
from libhmmer.impl_vmx.p7_oprofile cimport P7_OPROFILE
Expand All @@ -15,7 +14,6 @@ ELIF HMMER_IMPL == "SSE":

cdef extern from "hmmer.h" nogil:

ctypedef p7_alidisplay_s P7_ALIDISPLAY
cdef struct p7_alidisplay_s:
char* rfline
char* mmline
Expand Down Expand Up @@ -43,6 +41,7 @@ cdef extern from "hmmer.h" nogil:

int memsize
char* mem
ctypedef p7_alidisplay_s P7_ALIDISPLAY

P7_ALIDISPLAY *p7_alidisplay_Create(const P7_TRACE *tr, int which, const P7_OPROFILE *om, const ESL_SQ *sq, const ESL_SQ *ntsq);
P7_ALIDISPLAY *p7_alidisplay_Create_empty();
Expand All @@ -57,8 +56,8 @@ cdef extern from "hmmer.h" nogil:
float p7_alidisplay_DecodePostProb(char pc);
char p7_alidisplay_EncodeAliPostProb(float p, float hi, float med, float lo);

int p7_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli);
int p7_translated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli);
# int p7_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli);
# int p7_translated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, P7_PIPELINE *pli);
int p7_nontranslated_alidisplay_Print(FILE *fp, P7_ALIDISPLAY *ad, int min_aliwidth, int linewidth, int show_accessions);

int p7_alidisplay_Backconvert(const P7_ALIDISPLAY *ad, const ESL_ALPHABET *abc, ESL_SQ **ret_sq, P7_TRACE **ret_tr);
Expand Down
6 changes: 1 addition & 5 deletions include/libhmmer/p7_domain.pxd
Original file line number Diff line number Diff line change
@@ -1,15 +1,11 @@
from libc.stdint cimport uint8_t, uint32_t, int64_t

from libeasel.rand64 cimport ESL_RAND64
from libhmmer.p7_alidisplay cimport P7_ALIDISPLAY


cdef extern from "hmmer.h" nogil:

# forward declaration of P7_ALIDISPLAY to avoid circular imports
ctypedef p7_alidisplay_s P7_ALIDISPLAY
cdef struct p7_alidisplay_s:
pass

ctypedef p7_dom_s P7_DOMAIN
cdef struct p7_dom_s:
int64_t ienv
Expand Down
9 changes: 5 additions & 4 deletions include/libhmmer/p7_domaindef.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ from libeasel.sq cimport ESL_SQ
from libhmmer.p7_domain cimport P7_DOMAIN
from libhmmer.p7_spensemble cimport P7_SPENSEMBLE
from libhmmer.p7_trace cimport P7_TRACE
from libhmmer.p7_alidisplay cimport P7_ALIDISPLAY

IF HMMER_IMPL == "VMX":
from libhmmer.impl_vmx.p7_omx cimport P7_OMX
Expand All @@ -16,7 +17,6 @@ ELIF HMMER_IMPL == "SSE":

cdef extern from "hmmer.h" nogil:

ctypedef p7_domaindef_s P7_DOMAINDEF
cdef struct p7_domaindef_s:

float* mocc
Expand Down Expand Up @@ -53,13 +53,14 @@ cdef extern from "hmmer.h" nogil:
int nclustered
int noverlaps
int nenvelopes
ctypedef p7_domaindef_s P7_DOMAINDEF


P7_DOMAINDEF *p7_domaindef_Create (ESL_RANDOMNESS *r)
# int p7_domaindef_Fetch (P7_DOMAINDEF *ddef, int which, int *opt_i, int *opt_j, float *opt_sc, P7_ALIDISPLAY **opt_ad)
# int p7_domaindef_GrowTo (P7_DOMAINDEF *ddef, int L)
int p7_domaindef_Fetch (P7_DOMAINDEF *ddef, int which, int *opt_i, int *opt_j, float *opt_sc, P7_ALIDISPLAY **opt_ad)
int p7_domaindef_GrowTo (P7_DOMAINDEF *ddef, int L)
int p7_domaindef_Reuse (P7_DOMAINDEF *ddef)
# int p7_domaindef_DumpPosteriors(FILE *ofp, P7_DOMAINDEF *ddef)
int p7_domaindef_DumpPosteriors(FILE *ofp, P7_DOMAINDEF *ddef)
void p7_domaindef_Destroy(P7_DOMAINDEF *ddef)

# int p7_domaindef_ByViterbi (P7_PROFILE *gm, const ESL_SQ *sq, const ESL_SQ *ntsq, P7_GMX *gx1, P7_GMX *gx2, P7_DOMAINDEF *ddef)
Expand Down
35 changes: 27 additions & 8 deletions include/libhmmer/p7_pipeline.pxd
Original file line number Diff line number Diff line change
@@ -1,12 +1,15 @@
from libc.stdint cimport uint64_t
from libc.stdint cimport uint64_t, int64_t
from libc.stdio cimport FILE

from libeasel.sq cimport ESL_SQ
from libeasel.getopts cimport ESL_GETOPTS
from libeasel.random cimport ESL_RANDOMNESS
from libhmmer.fm cimport FM_DATA, FM_CFG
from libhmmer.p7_bg cimport P7_BG
from libhmmer.p7_domaindef cimport P7_DOMAINDEF
from libhmmer.p7_tophits cimport P7_TOPHITS
from libhmmer.p7_hmmfile cimport P7_HMMFILE
from libhmmer.p7_scoredata cimport P7_SCOREDATA

IF HMMER_IMPL == "VMX":
from libhmmer.impl_vmx.p7_omx cimport P7_OMX
Expand All @@ -33,8 +36,13 @@ cdef extern from "hmmer.h" nogil:
p7_ZSETBY_FILEINFO = 2

cdef enum p7_complementarity_e:
p7_NOCOMPLEMENT = 0
p7_COMPLEMENT = 1
p7_NOCOMPLEMENT = 0
p7_COMPLEMENT = 1

cdef enum p7_strands_e:
p7_STRAND_TOPONLY = 0
p7_STRAND_BOTTOMONLY = 1
p7_STRAND_BOTH = 2

# ctypedef p7_pipeline_s P7_PIPELINE
ctypedef struct P7_PIPELINE:
Expand Down Expand Up @@ -95,7 +103,7 @@ cdef extern from "hmmer.h" nogil:

p7_pipemodes_e mode
bint long_targets
int strands
p7_strands_e strands
int W
int block_length

Expand All @@ -121,7 +129,18 @@ cdef extern from "hmmer.h" nogil:
int p7_pli_NewModelThresholds(P7_PIPELINE *pli, const P7_OPROFILE *om)
int p7_pli_NewSeq (P7_PIPELINE *pli, const ESL_SQ *sq)
int p7_Pipeline (P7_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, const ESL_SQ *sq, const ESL_SQ *ntsq, P7_TOPHITS *th)
# int p7_Pipeline_LongTarget (P7_PIPELINE *pli, P7_OPROFILE *om, P7_SCOREDATA *data,
# P7_BG *bg, P7_TOPHITS *hitlist, int64_t seqidx,
# const ESL_SQ *sq, int complementarity,
# const FM_DATA *fmf, const FM_DATA *fmb, FM_CFG *fm_cfg)
int p7_Pipeline_LongTarget (
P7_PIPELINE* pli,
P7_OPROFILE* om,
P7_SCOREDATA* data,
P7_BG* bg,
P7_TOPHITS* hitlist,
int64_t seqidx,
const ESL_SQ* sq,
p7_complementarity_e complementarity,
const FM_DATA* fmf,
const FM_DATA* fmb,
FM_CFG* fm_cfg
)

int p7_pli_Statistics(FILE* ofp, P7_PIPELINE* pli, void* w)
28 changes: 28 additions & 0 deletions include/libhmmer/p7_scoredata.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from libhmmer.p7_profile cimport P7_PROFILE

IF HMMER_IMPL == "VMX":
from libhmmer.impl_vmx.p7_oprofile cimport P7_OPROFILE
ELIF HMMER_IMPL == "SSE":
from libhmmer.impl_sse.p7_oprofile cimport P7_OPROFILE

cdef extern from "hmmer.h" nogil:

cdef enum p7_scoredatatype_e:
p7_sd_std = 0
p7_sd_fm = 1

cdef struct p7_scoredata_s:
p7_scoredatatype_e type
int M
float* prefix_lengths
float* suffix_lengths
float* fwd_scores
float** fwd_transitions
float** opt_ext_fwd
float** opt_ext_rev
ctypedef p7_scoredata_s P7_SCOREDATA

cdef void p7_hmm_ScoreDataDestroy(P7_SCOREDATA* data)
cdef P7_SCOREDATA* p7_hmm_ScoreDataCreate(P7_OPROFILE* om, P7_PROFILE* gm)
cdef P7_SCOREDATA* p7_hmm_ScoreDataClone(P7_SCOREDATA* src, int Kp)
cdef int p7_hmm_ScoreDataComputeRest(P7_OPROFILE* om, P7_SCOREDATA* data)
11 changes: 4 additions & 7 deletions include/libhmmer/p7_tophits.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,11 @@ from libeasel.sq cimport ESL_SQ
from libhmmer.p7_domain cimport P7_DOMAIN
from libhmmer.p7_pipeline cimport p7_pipemodes_e, P7_PIPELINE
from libhmmer.p7_trace cimport P7_TRACE
from libhmmer.p7_alidisplay cimport P7_ALIDISPLAY


cdef extern from "hmmer.h" nogil:

# forward declaration of P7_ALIDISPLAY to avoid circular imports
ctypedef p7_alidisplay_s P7_ALIDISPLAY
cdef struct p7_alidisplay_s:
pass

# P7_HIT flags
cdef enum p7_hitflags_e:
p7_HITFLAGS_DEFAULT = 0b00000
Expand All @@ -28,7 +24,6 @@ cdef extern from "hmmer.h" nogil:
p7_IS_DROPPED = 0b01000
p7_IS_DUPLICATE = 0b10000

ctypedef p7_hit_s P7_HIT
cdef struct p7_hit_s:
char* name
char* acc
Expand Down Expand Up @@ -61,8 +56,9 @@ cdef extern from "hmmer.h" nogil:

P7_DOMAIN *dcl
esl_pos_t offset
ctypedef p7_hit_s P7_HIT


ctypedef p7_tophits_s P7_TOPHITS
cdef struct p7_tophits_s:
P7_HIT** hit
P7_HIT* unsrt
Expand All @@ -72,6 +68,7 @@ cdef extern from "hmmer.h" nogil:
uint64_t nincluded
bint is_sorted_by_sortkey
bint is_sorted_by_seqidx
ctypedef p7_tophits_s P7_TOPHITS

P7_TOPHITS *p7_tophits_Create()
int p7_tophits_Grow(P7_TOPHITS *h)
Expand Down
5 changes: 5 additions & 0 deletions pyhmmer/easel.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@ cdef class Alphabet:
cdef int _init_default(self, int ty) nogil except 1
cdef inline bint _eq(self, Alphabet other) nogil

cpdef bint is_dna(self)
cpdef bint is_rna(self)
cpdef bint is_amino(self)
cpdef bint is_nucleotide(self)


# --- Bitfield ---------------------------------------------------------------

Expand Down
4 changes: 4 additions & 0 deletions pyhmmer/easel.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@ class Alphabet(object):
def Kp(self) -> int: ...
@property
def symbols(self) -> str: ...
def is_dna(self) -> bool: ...
def is_rna(self) -> bool: ...
def is_amino(self) -> bool: ...
def is_nucleotide(self) -> bool: ...

# --- Bitfield ---------------------------------------------------------------

Expand Down
41 changes: 41 additions & 0 deletions pyhmmer/easel.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,47 @@ cdef class Alphabet:
cdef inline bint _eq(self, Alphabet other) nogil:
return self._abc.type == other._abc.type

# --- Methods ------------------------------------------------------------

cpdef bint is_dna(self):
"""is_dna(self)\n--
Check whether the `Alphabet` object is a DNA alphabet.
"""
assert self._abc != NULL
return self._abc.type == libeasel.alphabet.eslDNA

cpdef bint is_rna(self):
"""is_rna(self)\n--
Check whether the `Alphabet` object is a RNA alphabet.
"""
assert self._abc != NULL
return self._abc.type == libeasel.alphabet.eslRNA

cpdef bint is_amino(self):
"""is_amino(self)\n--
Check whether the `Alphabet` object is a protein alphabet.
"""
assert self._abc != NULL
return self._abc.type == libeasel.alphabet.eslAMINO

cpdef bint is_nucleotide(self):
"""is_nucleotide(self)\n--
Check whether the `Alphabet` object is a nucleotide alphabet.
"""
assert self._abc != NULL
return (
self._abc.type == libeasel.alphabet.eslDNA
or self._abc.type == libeasel.alphabet.eslRNA
)


# --- Bitfield ---------------------------------------------------------------

Expand Down
Loading

0 comments on commit 704fd88

Please sign in to comment.