gmmmap.py

#!/usr/bin/python
# coding: utf-8

import numpy as np
from numpy import linalg
from sklearn.mixture import GMM
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg

class GMMMap:
    """GMM-based frame-by-frame speech parameter mapping. 

    GMMMap represents a class to transform spectral features of a source
    speaker to that of a target speaker based on Gaussian Mixture Models
    of source and target joint spectral features.
    
    Notation
    --------
    Source speaker's feature: X = {x_t}, 0 <= t < T
    Target speaker's feature: Y = {y_t}, 0 <= t < T
    where T is the number of time frames.

    Parameters
    ----------
    gmm : scipy.mixture.GMM
        Gaussian Mixture Models of source and target joint features
    
    swap : bool
        True: source -> target
        False target -> source
    
    Attributes
    ----------
    num_mixtures : int
        the number of Gaussian mixtures

    weights : array, shape (`num_mixtures`)
        weights for each gaussian

    src_means : array, shape (`num_mixtures`, `order of spectral feature`)
        means of GMM for a source speaker

    tgt_means : array, shape (`num_mixtures`, `order of spectral feature`)
        means of GMM for a target speaker

    covarXX : array, shape (`num_mixtures`, `order of spectral feature`, 
        `order of spectral feature`)
        variance matrix of source speaker's spectral feature

    covarXY : array, shape (`num_mixtures`, `order of spectral feature`, 
        `order of spectral feature`)
        covariance matrix of source and target speaker's spectral feature

    covarYX : array, shape (`num_mixtures`, `order of spectral feature`, 
        `order of spectral feature`)
        covariance matrix of target and source speaker's spectral feature

    covarYY : array, shape (`num_mixtures`, `order of spectral feature`, 
        `order of spectral feature`)
        variance matrix of target speaker's spectral feature
    
    D : array, shape (`num_mixtures`, `order of spectral feature`, 
        `order of spectral feature`)
        covariance matrices of target static spectral features

    px : scipy.mixture.GMM
        Gaussian Mixture Models of source speaker's features

    Reference
    ---------
      - [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
        of Spectral Parameter Trajectory.
        http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf

    """
    def __init__(self, gmm, swap=False):
        # D is the order of spectral feature for a speaker
        self.num_mixtures, D = gmm.means_.shape[0], gmm.means_.shape[1]/2
        self.weights = gmm.weights_

        # Split source and target parameters from joint GMM
        self.src_means = gmm.means_[:, 0:D]
        self.tgt_means = gmm.means_[:, D:]
        self.covarXX = gmm.covars_[:, :D, :D]
        self.covarXY = gmm.covars_[:, :D, D:]
        self.covarYX = gmm.covars_[:, D:, :D]
        self.covarYY = gmm.covars_[:, D:, D:]

        # swap src and target parameters
        if swap:
            self.tgt_means, self.src_means = self.src_means, self.tgt_means
            self.covarYY, self.covarXX = self.covarXX, self.covarYY
            self.covarYX, self.covarXY = self.covarXY, self.covarYX

        # Compute D eq.(12) in [Toda 2007]
        self.D = np.zeros(self.num_mixtures*D*D).reshape(self.num_mixtures, D, D)
        for m in range(self.num_mixtures):
            xx_inv_xy = np.linalg.solve(self.covarXX[m], self.covarXY[m])
            self.D[m] = self.covarYY[m] - np.dot(self.covarYX[m], xx_inv_xy)

        # p(x), which is used to compute posterior prob. for a given source
        # spectral feature in mapping stage.
        self.px = GMM(n_components=self.num_mixtures, covariance_type="full")
        self.px.means_ = self.src_means
        self.px.covars_ = self.covarXX
        self.px.weights_ = self.weights

    def convert(self, src):
        """
        Mapping source spectral feature x to target spectral feature y 
        so that minimize the mean least squared error.
        More specifically, it returns the value E(p(y|x)].

        Parameters
        ----------
        src : array, shape (`order of spectral feature`)
            source speaker's spectral feature that will be transformed

        Return
        ------
        converted spectral feature
        """
        D = len(src)

        # Eq.(11)
        E = np.zeros((self.num_mixtures, D))
        for m in range(self.num_mixtures):
            xx = np.linalg.solve(self.covarXX[m], src - self.src_means[m])
            E[m] = self.tgt_means[m] + self.covarYX[m].dot(xx)
                
        # Eq.(9) p(m|x)
        posterior = self.px.predict_proba(np.atleast_2d(src))

        # Eq.(13) conditinal mean E[p(y|x)]
        return posterior.dot(E)
            
class TrajectoryGMMMap(GMMMap):
    """
    Trajectory-based speech parameter mapping for voice conversion
    based on the maximum likelihood criterion.

    Parameters
    ----------
    gmm : scipy.mixture.GMM
        Gaussian Mixture Models of source and target speaker joint features

    gv : scipy.mixture.GMM (default=None)
        Gaussian Mixture Models of target speaker's global variance of spectral
        feature
    
    swap : bool (default=False)
        True: source -> target
        False target -> source

    Attributes
    ----------
    TODO 

    Reference
    ---------
      - [Toda 2007] Voice Conversion Based on Maximum Likelihood Estimation
        of Spectral Parameter Trajectory.
        http://isw3.naist.jp/~tomoki/Tomoki/Journals/IEEE-Nov-2007_MLVC.pdf
    """
    def __init__(self, gmm, T, gv=None, swap=False):
        GMMMap.__init__(self, gmm, swap)

        self.T = T
        # shape[1] = d(src) + d(src_delta) + d(tgt) + d(tgt_delta)
        D = gmm.means_.shape[1] / 4

        ## Setup for Trajectory-based mapping
        self.__construct_weight_matrix(T, D)

        ## Setup for GV post-filtering
        # It is assumed that GV is modeled as a single mixture GMM
        if gv != None:
            self.gv_mean = gv.means_[0]
            self.gv_covar = gv.covars_[0]
            self.Pv = np.linalg.inv(self.gv_covar)

    def __construct_weight_matrix(self, T, D):
        # Construct Weight matrix W
        # Eq.(25) ~ (28)
        for t in range(T):
            w0 = scipy.sparse.lil_matrix((D, D*T))
            w1 = scipy.sparse.lil_matrix((D, D*T))
            w0[0:,t*D:(t+1)*D] = scipy.sparse.diags(np.ones(D), 0)

            if t-1 >= 0:
                tmp = np.zeros(D)
                tmp.fill(-0.5)
                w1[0:,(t-1)*D:t*D] = scipy.sparse.diags(tmp, 0)
            if t+1 < T:
                tmp = np.zeros(D)
                tmp.fill(0.5)
                w1[0:,(t+1)*D:(t+2)*D] = scipy.sparse.diags(tmp, 0)

            W_t = scipy.sparse.vstack([w0, w1])

            # Slower
            # self.W[2*D*t:2*D*(t+1),:] = W_t

            if t == 0:
                self.W = W_t
            else:
                self.W = scipy.sparse.vstack([self.W, W_t])

        self.W = scipy.sparse.csr_matrix(self.W)

        assert self.W.shape == (2*D*T, D*T)
        
    def convert(self, src):
        """
        Mapping source spectral feature x to target spectral feature y 
        so that maximize the likelihood of y given x.

        Parameters
        ----------
        src : array, shape (`the number of frames`, `the order of spectral feature`)
            a sequence of source speaker's spectral feature that will be
            transformed

        Return
        ------
        a sequence of transformed spectral features
        """
        T, D = src.shape[0], src.shape[1]/2

        if T != self.T:
            self.__construct_weight_matrix(T, D)

        # A suboptimum mixture sequence  (eq.37)
        optimum_mix = self.px.predict(src)

        # Compute E eq.(40)
        self.E = np.zeros((T, 2*D))
        for t in range(T):
            m = optimum_mix[t] # estimated mixture index at time t
            xx = np.linalg.solve(self.covarXX[m], src[t] - self.src_means[m])
            # Eq. (22)
            self.E[t] = self.tgt_means[m] + np.dot(self.covarYX[m], xx)
        self.E = self.E.flatten()

        # Compute D eq.(41). Note that self.D represents D^-1.
        self.D = np.zeros((T, 2*D, 2*D))
        for t in range(T):
            m = optimum_mix[t]
            xx_inv_xy = np.linalg.solve(self.covarXX[m], self.covarXY[m])
            # Eq. (23)
            self.D[t] = self.covarYY[m] - np.dot(self.covarYX[m], xx_inv_xy)
            self.D[t] = np.linalg.inv(self.D[t])
        self.D = scipy.linalg.block_diag(*self.D)

        # represent D as a sparse matrix
        self.D = scipy.sparse.csr_matrix(self.D)

        # Compute target static features
        # eq.(39)
        covar = self.W.T.dot(self.D.dot(self.W))
        y = scipy.sparse.linalg.spsolve(covar, self.W.T.dot(self.D.dot(self.E)),\
                                        use_umfpack=False)
        return y.reshape((T, D))