paradiseo/contribution/branches/PhyloMOEA/PhyloMOEA-serial/PhyloMOEA/likelihoodcalculator.cpp

544 lines
14 KiB
C++

/***************************************************************************
* Copyright (C) 2005 by Waldo Cancino *
* wcancino@icmc.usp.br *
* *
* This program is free software; you can redistribute it and/or modify *
* it under the terms of the GNU General Public License as published by *
* the Free Software Foundation; either version 2 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU General Public License for more details. *
* *
* You should have received a copy of the GNU General Public License *
* along with this program; if not, write to the *
* Free Software Foundation, Inc., *
* 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
***************************************************************************/
#include "likelihoodcalculator.h"
#include "treeIterator.h"
#include <gsl/gsl_randist.h>
#include <gsl/gsl_cdf.h>
#define BL_MIN 1.e-10
//pthread_mutex_t mdone_mutex;
//extern "C" void *lik_slave(void *);
void *lik_slave_class(void *ind)
{
LikelihoodCalculator *t = (LikelihoodCalculator *)ind;
if(t->worker == 0)
{
// pthread_mutex_lock(&mdone_mutex);
t->worker = 1;
// pthread_mutex_unlock(&mdone_mutex);
t->calculate_partials( t->a, &(t->b));
}
else
{
// pthread_mutex_lock(&mdone_mutex);
t->worker = 0;
// pthread_mutex_unlock(&mdone_mutex);
t->calculate_partials( t->b, &(t->a));
}
return (NULL);
}
void *lik_slave(void *ind)
{
LikelihoodCalculator *t = (LikelihoodCalculator *)ind;
lik_slave_class(t);
return NULL;
}
LikelihoodCalculator::LikelihoodCalculator( phylotreeIND &ind, SubstModel &m, ProbMatrixContainer &p, unsigned int nr)
{
if(nr<1)nrates=1;
else nrates=nr;
alpha = 2.0;
part_memory_internal = part_memory_taxons = NULL;
SeqData = NULL; rates_prob = rates_freq = NULL; prob = NULL;
model = &m;
probmatrixs = &p;
invalid_partials_taxons = true;
set_data( ind.get_patterns() );
set_tree(ind);
build_rates_sites();
}
LikelihoodCalculator::LikelihoodCalculator( phylotreeIND &ind, Sequences &Seqs, SubstModel &m, ProbMatrixContainer &p, unsigned int nr)
{
part_memory_internal = part_memory_taxons = NULL;
rates_prob = rates_freq = NULL; prob = NULL;
if(nr<1)nrates=1;
else nrates=nr;
alpha = 2.0;
SeqData = NULL;
model = &m;
probmatrixs = &p;
invalid_partials_taxons = true;
set_data( Seqs);
set_tree(ind);
build_rates_sites();
}
void LikelihoodCalculator::build_rates_sites()
{
if(rates_prob!=NULL) delete [] rates_prob;
if(rates_freq!=NULL) delete [] rates_freq;
if(prob!=NULL) delete [] prob;
rates_prob = new double[nrates];
prob = new double*[nrates]; // points rate matrixs
rates_freq= new double[nrates];
if(nrates==1)
{
rates_prob[0] = rates_freq[0] = 1.0;
return;
}
/*
double mean = 0.0;
for (int i = 0; i < nrates; i++)
{
rates_prob[i] = gsl_cdf_gamma_Pinv((2.0*i+1.0)/(2.0*nrates), alpha, (1.0)/alpha);
cout << "quantile : " << (2.0*i+1.0)/(2.0*nrates) << " " << rates_prob[i] << endl;
mean += rates_prob[i];
}
mean = mean/(double) nrates;
cout << "mean : " << mean << endl;
for (int i = 0; i < nrates; i++)
{
rates_prob[i] /= mean;
rates_freq[i] = 1.0/(double) nrates;
}
*/
DiscreteGamma(rates_freq, rates_prob, alpha, alpha, nrates, 0); // 0 = phyml 2.4
/* for (int i=0; i<nrates; i++) cout << "rate " << i << " " << rates_prob[i] << " " << rates_freq[i] << endl;
int k;*/
//std::cin >> k;
}
// change the default data
void LikelihoodCalculator::set_data( const Sequences &seqs)
{
// first assignment
if( SeqData == NULL )
{
SeqData = (Sequences *)&seqs;
allocate_partials(); // first assignment, allocate memoryAlleleString
}
else
// if the data changes, invalidate partial taxons
{
if( SeqData != &seqs)
{
SeqData = (Sequences *)&seqs;
deallocate_partials();
// allocate again
//cout << "allocating partials" << endl;
allocate_partials();
// invalidate partial taxons
//cout << "warning.... changing patterns..." << endl;
invalid_partials_taxons = true;
}
}
}
void LikelihoodCalculator::set_tree( phylotreeIND &ind )
{
tree_ptr = &ind;
long total_pos = SeqData->pattern_count();
set_data( tree_ptr->get_patterns() );
Partials.init(tree_ptr->TREE);
graph::node_iterator it = tree_ptr->TREE.nodes_begin();
graph::node_iterator it2 = tree_ptr->TREE.nodes_end();
// allocate partial matrix for each pattern and internal node (taxon don't have partials)
for(int i=0, j=0; it!=it2; it++, j++)
{
Factors[*it] = part_memory_factors + j * total_pos;
if(!tree_ptr->istaxon(*it))
{
Partials[*it] = part_memory_internal + i * total_pos * nrates* 4;
i++;
}
}
}
double LikelihoodCalculator::calculate_likelihood_exp(edge focus)
{
double lik;
// pthread_t threads[2]; /* holds thread info */
// int ids[2];
worker = 0;
// select an internal node as root
bfocus = focus;
a = tree_ptr->istaxon(bfocus.target()) ? bfocus.source() : bfocus.target();
b = bfocus.opposite(a);
//cout << "iniciando partials" << endl;
tree_ptr->convert_graph_to_tree(a, NULL);
init_partials();
// traverse the tree and calculate the partials
/*for(int i=0; i<2; i++)
pthread_create(&threads[i], NULL, lik_slave_class, (void*)this);
for(int i=0; i<2; i++)
pthread_join(threads[i],NULL);*/
//cout << "calculando partials..." << endl;
calculate_partials( a, &b);
calculate_partials( b, &a);
//cout << "somando..." << endl;
// sum all partials
lik = sum_site_liks();
return lik;
}
// double LikelihoodCalculator::calculate_all_likelihoods()
// {
// oldroot = invalid_node;
// graph::edge_iterator it = tree_ptr->TREE.edges_begin();
// graph::edge_iterator it_end = tree_ptr->TREE.edges_end();
// while(it != it_end)
// {
//
// // select a new root
// bfocus = *it;
// a = tree_ptr->istaxon(bfocus.target()) ? bfocus.source() : bfocus.target();
// b = bfocus.opposite(a);
//
// for(int i=0; i< nrates; i++)
// {
// double len = tree_ptr->get_branch_length(bfocus)*rates_prob[i];
// len = len < BL_MIN ? BL_MIN : len;
// ProbMatrix &p = (*probmatrixs)[ len];
// prob[i] = p.p;
// }
//
//
// // make the new tree
// tree_ptr->convert_graph_to_tree(a, NULL);
//
// if( oldroot!=invalid_node)
// {
// update_partials(); //cj( oldroot, a, *it);
// cout << "likelihood ..." << sum_site_liks() <<endl;
// }
// else { //first iteration
// init_partials();
// calculate_partials( a, &b);
// calculate_partials( b, &a);
// cout << "likelihood ..." << sum_site_liks() << endl;
// }
//
// for(int i=0; i< nrates; i++)
// {
// double factor = rates_prob[i];
// double len = tree_ptr->get_branch_length(bfocus);
// len = len < BL_MIN ? BL_MIN : len;
// probmatrixs->change_matrix( len*factor, tree_ptr->get_branch_length(bfocus)*factor);
// }
//
// oldroot = a;
// ++it;
// }
// }
void LikelihoodCalculator::update_partials() //node *oldroot, node newroot, edge newedge)
{
// father of oldroot
node tmp = oldroot;
node father = bfocus.opposite(a);
do
{
recalculate_cj(tmp, father);
if(tmp == a) break;
tmp = tmp.in_edges_begin()->source();
}while(1);
}
// recalculate cj when t changes in one of his sons
// only work for trees
void LikelihoodCalculator::recalculate_cj(node n, node father)
{
int seqlen = tree_ptr->number_of_positions();
node::out_edges_iterator it;
node::out_edges_iterator it_end;
it = n.out_edges_begin();
it_end = n.out_edges_end();
node nodeaux;
for(int i=0; i<seqlen; i++)
{
(Factors)[n][i] = 0;
for(int r=0; r< nrates; r++)
for(int k=0; k < 4; k++)
(Partials)[n][i*nrates*4+r*4+k] = 1;
}
while( it != it_end )
{
nodeaux = it->target();
if(nodeaux !=father)calculate_node_partial( n, nodeaux, *it);
it++;
}
}
double LikelihoodCalculator::calculate_likelihood()
{
double lik;
// pthread_t threads[2]; /* holds thread info */
// int ids[2];
worker = 0;
// select an internal node as root
bfocus = *(tree_ptr->TREE.edges_begin());
a = tree_ptr->istaxon(bfocus.target()) ? bfocus.source() : bfocus.target();
b = bfocus.opposite(a);
init_partials();
// traverse the tree and calculate the partials
/*for(int i=0; i<2; i++)
pthread_create(&threads[i], NULL, lik_slave_class, (void*)this);
for(int i=0; i<2; i++)
pthread_join(threads[i],NULL);*/
//cout << "calculando partials..." << endl;
calculate_partials( a, &b);
calculate_partials( b, &a);
//cout << "somando..." << endl;
// sum all partials
lik = sum_site_liks();
return lik;
}
double LikelihoodCalculator::sum_site_liks( )
{
int seqlen = tree_ptr->number_of_positions();
register double lik = 0;
register double factor_correct;
double len;
for(int i=0; i< nrates; i++)
{
len = tree_ptr->get_branch_length(bfocus)*rates_prob[i];
len = len < BL_MIN ? BL_MIN : len;
ProbMatrix &p = (*probmatrixs)[ len];
prob[i] = p.p;
}
for(int i=0; i < seqlen; i++)
{
factor_correct = Factors[a][i] + Factors[b][i] ;
site_liks[i] = sum_partials(i);
lik += ( log(site_liks[i]) + factor_correct)* SeqData->pattern_count(i);
}
return lik;
}
double LikelihoodCalculator::sum_partials( int pos )
{
if(tree_ptr->istaxon( b))
{
return sum_partials_a_to_taxon( pos );
}
else return sum_partials_a_to_b_notaxon( pos);
}
double LikelihoodCalculator::sum_partials_a_to_taxon( int pos)
{
double sum = 0;
unsigned char *meaning;
char char_state_b = SeqData->pattern_pos( pos, tree_ptr->taxon_id( b) );
for(int i=0; i<nrates; i++)
{
int index = pos*nrates*4 + i*4;
for(int k=0; k < 4; k++)
{
// defined nucleotide
if( SeqData->is_defined(char_state_b) )
sum += rates_freq[i]*SeqData->frequence(k)* prob[i][k*4+char_state_b]* Partials[a][index+k];
// ambigous nucleotide
else if (SeqData->is_ambiguous( char_state_b) )
{
meaning = SeqData->ambiguos_meaning( char_state_b );
for(int l=0; l < 4; l++)
sum += rates_freq[i]*SeqData->frequence(k)* prob[i][k*4+l]* Partials[a][index+k] * meaning[l];
//Partials[b][pos*4+l];
}
// gap or undefined
else
{
for(int l=0; l < 4; l++)
sum += rates_freq[i]*SeqData->frequence(k)* prob[i][k*4+l]* Partials[a][index+k];
}
}
}
return sum;
}
double LikelihoodCalculator::sum_partials_a_to_b_notaxon( int pos)
{
double sum = 0;
for(int j=0; j<nrates; j++)
{
int index = pos*nrates*4 + j*4;
for(int k=0; k < 4; k++)
{
for(int l=0; l < 4; l++)
{
sum += rates_freq[j]*SeqData->frequence(k)* prob[j][k*4+l]
* Partials[a][index+k] * Partials[b][index+l];
}
}
}
return sum;
}
// calculate conditional likelihoods
void LikelihoodCalculator::calculate_node_partial( node father, node son, edge edgeaux)
{
register double sum;
double corr_factor;
long index;
double len;
register int seqlen = tree_ptr->number_of_positions();
for(int k=0; k<seqlen;k++)
{
index = k*nrates*4;
// accumulatre
Factors[father][k]+=Factors[son][k];
corr_factor = MDBL_MIN;
for(int r=0; r<nrates; r++)
{
len = tree_ptr->get_branch_length(edgeaux) * rates_prob[r];
len = (len < BL_MIN ? BL_MIN : len);
ProbMatrix &p = (*probmatrixs)[ len ];
for(int i=0; i < 4; i++)
{
sum = 0;
if(tree_ptr->istaxon( son))
{
unsigned char l=SeqData->pattern_pos( k, tree_ptr->taxon_id( son));
if(SeqData->is_defined( l) ) sum = p.p_ij_t( i, l );
else if(SeqData->is_ambiguous( l))
{
unsigned char *meaning = SeqData->ambiguos_meaning( l);
for(int j=0; j < 4; j++)
sum +=meaning[j]* p.p_ij_t( i, j );
//sum +=Partials[son][k*4+j]* p.p_ij_t( i, j );
}
else sum = 1;
}
else{
for(int j=0; j < 4; j++)
{
sum +=Partials[son][index+ r*4 +j]* p.p_ij_t( i, j );
}
}
Partials[father][index + r*4 +i] *= sum;
corr_factor = ( sum > corr_factor ? sum : corr_factor);
}
}
if( corr_factor < UMBRAL || corr_factor > (1./LIM_SCALE_VAL))
{
//cout << "escalado ..." << endl;
for(int r=0; r< nrates; r++)
for(int i=0; i<4; i++)
Partials[father][index + r*4+ i] /= corr_factor;
Factors[father][k] += log(corr_factor);
}
}
}
void LikelihoodCalculator::init_partials()
{
// initialize the values of C for each node
long seqlen = tree_ptr->number_of_positions();
int num_taxons = tree_ptr->number_of_taxons();
int num_internal_nodes = tree_ptr->TREE.number_of_nodes() - num_taxons;
long size_part_factors = seqlen*(num_taxons +num_internal_nodes);
long size_part_memory_internal = nrates*num_internal_nodes*seqlen*4;
// correction factors
//for(long i=0; i<seqlen; i++)
// factors[i] = 0;
for(long i=0; i< seqlen*(num_taxons +num_internal_nodes); i++)
part_memory_factors[i] = 0;
// init internal nodes
for(long i=0; i< size_part_memory_internal; i++)
part_memory_internal[i] = 1.0;
}
// calculate the values de C_j for all nodes
void LikelihoodCalculator::calculate_partials(node n, node *antecessor)
{
postorder_Iterator it = tree_ptr->postorder_begin( n, *antecessor);
while(*it!=n)
{
calculate_node_partial( it.ancestor(), *it, it.branch());
++it;
}
}