Skip to content

Commit

Permalink
reHC*, first draft of the version able to automagically process multi…
Browse files Browse the repository at this point in the history
…-allelic genotypes and haplotypes.
  • Loading branch information
yp committed Jun 22, 2012
1 parent c0994f9 commit 7faad5d
Show file tree
Hide file tree
Showing 4 changed files with 237 additions and 37 deletions.
207 changes: 197 additions & 10 deletions include/ped2cnf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,15 +120,15 @@ class ped2cnf_conv_t:
}
}

void add_individual_constraint(pedcnf_t& cnf,
const individual_t& ind,
const g& gen,
const size_t l,
const size_t i) {
void add_individual_constraint_bi(pedcnf_t& cnf,
const individual_t& ind,
const g& gen,
const size_t l,
const size_t i) {
/************
* Clauses for errors variables
************/
if (gen != g::MISS) {
if (is_genotyped(gen)) {
lit_t e= cnf.get_e(i, l);
lit_t p= cnf.get_p(i, l);
lit_t m= cnf.get_m(i, l);
Expand Down Expand Up @@ -191,13 +191,141 @@ class ped2cnf_conv_t:
************/
};

/************
* MULTIALLELIC VERSION
************/
void add_individual_constraint_multi(pedcnf_t& cnf,
const individual_t& ind,
const g& gen,
const size_t l,
const size_t i,
const size_t no_of_alleles) {
/************
* Extract pm and mm variables
************/
lit_t* p= new lit_t[no_of_alleles];
lit_t* m= new lit_t[no_of_alleles];
for (size_t j= 0; j<no_of_alleles; ++j) {
p[j]= cnf.get_pm(i, l, j);
m[j]= cnf.get_mm(i, l, j);
}
/************
* Consistency of haplotypes
* (exactly only one TRUE among {p,m}_i,j[l])
************/
// at least one
cnf.add_clause(p, no_of_alleles);
cnf.add_clause(m, no_of_alleles);
// at most one (quadratic version)
for (size_t j1= 0; j1<no_of_alleles-1; ++j1) {
for (size_t j2= j1+1; j2<no_of_alleles; ++j2) {
cnf.add_clause<2>((lit_t[]){ -p[j1], -p[j2]});
cnf.add_clause<2>((lit_t[]){ -m[j1], -m[j2]});
}
}
/************
* Clauses for errors variables
************/
if (is_genotyped(gen)) {
const lit_t e= cnf.get_e(i, l);
const size_t a= ((size_t)gen.allele1())-1;
const size_t b= ((size_t)gen.allele2())-1;
if (is_homozygous(gen)) {
cnf.add_clause<2>((lit_t[]){ e, p[a] });
cnf.add_clause<2>((lit_t[]){ e, m[a] });
cnf.add_clause<3>((lit_t[]){ -e, -p[a], -m[a] });
} else if (is_heterozygous(gen)) {
cnf.add_clause<3>((lit_t[]){ e, p[a], m[a] });
cnf.add_clause<3>((lit_t[]){ e, -p[a], -m[a] });
cnf.add_clause<3>((lit_t[]){ e, p[b], m[b] });
cnf.add_clause<3>((lit_t[]){ e, -p[b], -m[b] });

cnf.add_clause<3>((lit_t[]){ e, -p[a], -p[b] });
cnf.add_clause<3>((lit_t[]){ e, -m[a], -m[b] });

cnf.add_clause<5>((lit_t[]){ -e, -p[a], m[a], p[b], -m[b] });
cnf.add_clause<5>((lit_t[]){ -e, p[a], -m[a], -p[b], m[b] });
} else {
MY_FAIL;
}
}
/************
* End clauses for errors variables
************/
/************
* Clauses for recombination variables
************/
// (only for loci after the first)
if (l > 0) {
// a recombination event occurs only on heterozygous loci (on parent), thus:
// ( p_{i,l} == m_{i,l} ) implies not r_{i,l}
if (ind.has_father()) {
lit_t* clause= new lit_t[1+(2*no_of_alleles)];
clause[0]= -cnf.get_rp(i, l);
for (size_t j= 0; j<no_of_alleles; ++j) {
clause[1+(2*j)]= cnf.get_pm(ind.father().progr_id(), l, j);
clause[2+(2*j)]= cnf.get_mm(ind.father().progr_id(), l, j);
}
for (size_t j= 0; j<no_of_alleles; ++j) {
clause[1+(2*j)]= -clause[1+(2*j)];
clause[2+(2*j)]= -clause[2+(2*j)];
cnf.add_clause(clause, 1+(2*no_of_alleles));
clause[1+(2*j)]= -clause[1+(2*j)];
clause[2+(2*j)]= -clause[2+(2*j)];
}
delete [] clause;
}
if (ind.has_mother()) {
lit_t* clause= new lit_t[1+(2*no_of_alleles)];
clause[0]= -cnf.get_rm(i, l);
for (size_t j= 0; j<no_of_alleles; ++j) {
clause[1+(2*j)]= cnf.get_pm(ind.mother().progr_id(), l, j);
clause[2+(2*j)]= cnf.get_mm(ind.mother().progr_id(), l, j);
}
for (size_t j= 0; j<no_of_alleles; ++j) {
clause[1+(2*j)]= -clause[1+(2*j)];
clause[2+(2*j)]= -clause[2+(2*j)];
cnf.add_clause(clause, 1+(2*no_of_alleles));
clause[1+(2*j)]= -clause[1+(2*j)];
clause[2+(2*j)]= -clause[2+(2*j)];
}
delete [] clause;
}
}
/************
* End clauses for recombination variables
************/
delete [] p;
delete [] m;
};

void add_parental_constraint(pedcnf_t& cnf,
const g& parent_g,
const g& individual_g,
const size_t locus,
const size_t progr_id_parent,
const size_t progr_id_ind,
const bool is_mother) {
const bool is_mother,
const size_t no_of_alleles) {
if (no_of_alleles<=2) {
add_parental_constraint_bi(cnf, parent_g, individual_g,
locus, progr_id_parent, progr_id_ind,
is_mother);
} else {
add_parental_constraint_multi(cnf, parent_g, individual_g,
locus, progr_id_parent, progr_id_ind,
is_mother, no_of_alleles);
}
};

// maximum 2 alleles !!
void add_parental_constraint_bi(pedcnf_t& cnf,
const g& parent_g,
const g& individual_g,
const size_t locus,
const size_t progr_id_parent,
const size_t progr_id_ind,
const bool is_mother) {
/************
* Clauses for Mendelian consistency (s-variables)
************/
Expand Down Expand Up @@ -239,17 +367,75 @@ class ped2cnf_conv_t:
************/
};

// minimum 3 alleles !!
void add_parental_constraint_multi(pedcnf_t& cnf,
const g& parent_g,
const g& individual_g,
const size_t locus,
const size_t progr_id_parent,
const size_t progr_id_ind,
const bool is_mother,
const size_t no_of_alleles) {
/************
* Clauses for Mendelian consistency (s-variables)
************/
lit_t s= (!is_mother) ?
cnf.get_sp(progr_id_ind, locus) : cnf.get_sm(progr_id_ind, locus);
for (size_t j= 0; j<no_of_alleles; ++j) {
lit_t p= cnf.get_pm(progr_id_parent, locus, j);
lit_t m= cnf.get_mm(progr_id_parent, locus, j);
lit_t c= (!is_mother) ?
cnf.get_pm(progr_id_ind, locus, j) : cnf.get_mm(progr_id_ind, locus, j);
cnf.add_clause<3>((lit_t[]){ s, p, -c});
cnf.add_clause<3>((lit_t[]){ s, -p, c});
cnf.add_clause<3>((lit_t[]){-s, m, -c});
cnf.add_clause<3>((lit_t[]){-s, -m, c});
cnf.add_clause<3>((lit_t[]){-p, -m, c});
cnf.add_clause<3>((lit_t[]){ p, m, -c});
}
/************
* End clauses for Mendelian consistency (s-variables)
************/
/************
* Clauses for Recombination events (r-variables)
************/
if (locus>0) {
lit_t prevs= (!is_mother) ?
cnf.get_sp(progr_id_ind, locus-1) : cnf.get_sm(progr_id_ind, locus-1);
lit_t r= (!is_mother) ?
cnf.get_rp(progr_id_ind, locus) : cnf.get_rm(progr_id_ind, locus);
// s == prevs + r
#ifdef AVOID_XOR_CLAUSES
cnf.add_clause<3>((lit_t[]){-s, r, prevs});
cnf.add_clause<3>((lit_t[]){-s, -r, -prevs});
cnf.add_clause<3>((lit_t[]){ s, r, -prevs});
cnf.add_clause<3>((lit_t[]){ s, -r, prevs});
#else
cnf.add_xor_clause<3>((lit_t[]){-s, r, prevs});
#endif
}
/************
* End clauses for Recombination events (r-variables)
************/
};

public:

void convert(const pedigree_t& ped, pedcnf_t& cnf) {
// Compute the number of alleles of each locus
const size_t* const max_alleles= ped.no_of_alleles();
BOOST_FOREACH( const individual_t& ind,
ped.individuals() ) {
L_TRACE("Considering individual " << ind.progr_id());
prepare_individual(cnf, ped, ind);
for (size_t l= 0; l < ped.genotype_length(); ++l) {
L_TRACE(" locus = " << l <<
", g_i = " << ind.obs_g(l));
add_individual_constraint(cnf, ind, ind.obs_g(l), l, ind.progr_id());
if (max_alleles[l] <= 2) {
add_individual_constraint_bi(cnf, ind, ind.obs_g(l), l, ind.progr_id());
} else {
add_individual_constraint_multi(cnf, ind, ind.obs_g(l), l, ind.progr_id(), max_alleles[l]);
}
}
if (ind.has_father()) {
L_TRACE(" --> father " << ind.father().progr_id());
Expand All @@ -262,7 +448,7 @@ class ped2cnf_conv_t:
parent.obs_g(l), ind.obs_g(l),
l,
parent.progr_id(), ind.progr_id(),
false);
false, max_alleles[l]);
}
}
if (ind.has_mother()) {
Expand All @@ -276,10 +462,11 @@ class ped2cnf_conv_t:
parent.obs_g(l), ind.obs_g(l),
l,
parent.progr_id(), ind.progr_id(),
true);
true, max_alleles[l]);
}
}
}
delete [] max_alleles;
};
};

Expand Down
4 changes: 4 additions & 0 deletions include/pedcnf.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,10 @@ class pedcnf_t
add_clause(clause_t(clause, clause+LEN));
};

void add_clause(const lit_t* const clause, const size_t LEN) {
add_clause(clause_t(clause, clause+LEN));
};

#ifndef AVOID_XOR_CLAUSES
template <int LEN>
void add_xor_clause(const lit_t* const clause) {
Expand Down
48 changes: 21 additions & 27 deletions include/pedcnf2hc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ void compute_reHC_from_SAT(basic_pedigree_t<T_GENOTYPE, T_HAPLOTYPE, T_PHENOTYPE
log4cxx::LoggerPtr logger(log4cxx::Logger::getLogger("pedcnf2hc"));
typedef basic_pedigree_t<T_GENOTYPE, T_HAPLOTYPE, T_PHENOTYPE, T_ID> family_t;
INFO("Computing the (r,e)-haplotype configuration...");
const size_t* const no_of_alleles= ped.no_of_alleles();
size_t no_of_errors= 0;
size_t no_of_imputation= 0;
// For each locus in each individual:
Expand All @@ -66,27 +67,27 @@ void compute_reHC_from_SAT(basic_pedigree_t<T_GENOTYPE, T_HAPLOTYPE, T_PHENOTYPE
ped.individuals() ) {
TRACE("Considering individual " << ind.progr_id());
for (size_t locus= 0; locus < ped.genotype_length(); ++locus) {
const bool pil= cnf.p(ind.progr_id(), locus);
const bool mil= cnf.m(ind.progr_id(), locus);
allele_t gp, gm;
if (no_of_alleles[locus] <= 2) {
gp = !cnf.p(ind.progr_id(), locus) ? 1 : 2;
gm = !cnf.m(ind.progr_id(), locus) ? 1 : 2;
} else {
for (gp= 0; gp<no_of_alleles[locus] && !cnf.pm(ind.progr_id(), locus, gp); ++gp);
MY_ASSERT(gp < no_of_alleles[locus]);
gp += 1;
for (gm= 0; gm<no_of_alleles[locus] && !cnf.mm(ind.progr_id(), locus, gm); ++gm);
MY_ASSERT(gm < no_of_alleles[locus]);
gm += 1;
}
if ( ! is_genotyped(ind.obs_g(locus)) ) {
// Individual not genotyped ->
// -> imputing genotype based on variables p_i_l and m_i_l
TRACE("Individual " << ind.progr_id() << " at locus " << locus
<< " is not genotyped.");
TRACE("pil " << pil << " mil " << mil);
if ( pil == mil ) {
DEBUG("Not-genotyped individual " << ind.progr_id() <<
" at locus " << locus << " is imputed as homozygous.");
if ( !pil ) {
ind.real_g(locus).set_alleles(1, 1);
} else {
ind.real_g(locus).set_alleles(2, 2);
}
} else {
DEBUG("Not-genotyped individual " << ind.progr_id() <<
" at locus " << locus << " is imputed as heterozygous.");
ind.real_g(locus).set_alleles(1, 2);
}
TRACE("pil " << gp << " mil " << gm);
DEBUG("Not-genotyped individual " << ind.progr_id() <<
" at locus " << locus << " is imputed as (" << gp << ", " << gm << ").");
ind.real_g(locus).set_alleles(gp, gm);
++no_of_imputation;
} else {
ind.real_g(locus)= ind.obs_g(locus);
Expand All @@ -95,18 +96,10 @@ void compute_reHC_from_SAT(basic_pedigree_t<T_GENOTYPE, T_HAPLOTYPE, T_PHENOTYPE
// Impossible: we performed genotype imputation in the previous step
MY_FAIL;
} else {
ind.hp(locus)= (!pil) ? (family_t::h::ALLELE(1)) : (family_t::h::ALLELE(2));
ind.hm(locus)= (!mil) ? (family_t::h::ALLELE(1)) : (family_t::h::ALLELE(2));
ind.hp(locus)= family_t::h::ALLELE(gp);
ind.hm(locus)= family_t::h::ALLELE(gm);
typename family_t::g real_g= ind.real_g(locus);
if ( pil == mil ) {
if ( !pil ) {
real_g.set_alleles(1, 1);
} else {
real_g.set_alleles(2, 2);
}
} else {
real_g.set_alleles(1, 2);
}
real_g.set_alleles(gp, gm);
if (real_g != ind.real_g(locus)) {
// Not-genotyped loci cannot have errors
MY_ASSERT( is_genotyped(ind.obs_g(locus)) );
Expand All @@ -120,6 +113,7 @@ void compute_reHC_from_SAT(basic_pedigree_t<T_GENOTYPE, T_HAPLOTYPE, T_PHENOTYPE
}
}
}
delete [] no_of_alleles;
INFO("Number of imputed genotypes: " << no_of_imputation);
INFO("Number of corrected errors: " << no_of_errors);
INFO("(r,e)-haplotype configuration successfully computed.");
Expand Down
15 changes: 15 additions & 0 deletions include/pedigree.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,21 @@ class basic_pedigree_t: boost::noncopyable {
return _len;
}

const size_t* const no_of_alleles() const {
size_t* max_alleles= new size_t[genotype_length()];
for (size_t l= 0; l < genotype_length(); ++l) {
max_alleles[l]= 0;
}
BOOST_FOREACH( const individual_t& ind,
individuals() ) {
for (size_t l= 0; l < genotype_length(); ++l) {
// Remember that allele1() <= allele2()
max_alleles[l]= std::max(max_alleles[l], (size_t)ind.obs_g(l).allele2());
}
}
return max_alleles;
};

bool is_completely_haplotyped() const {
DEBUG("Checking if the haplotype configuration is complete...");
BOOST_FOREACH( const individual_t& ind,
Expand Down

0 comments on commit 7faad5d

Please sign in to comment.