Skip to main content

KL divergence of two Gaussian distributions

For a multivariate Gaussian random variable $\mathbf{x} \sim \mathcal{N}(\boldsymbol{\mu}, \Sigma)$, the probability density function is

\[
p(\mathbf{x}) = \frac{1}{(2\pi)^{n/2} |\Sigma|^{1/2}} \exp\Bigl(-\frac{1}{2}(\mathbf{x} - \boldsymbol{\mu})^T \Sigma^{-1} (\mathbf{x} - \boldsymbol{\mu})\Bigr)
\]
  • $\boldsymbol{\mu}$ is the $n$-dimensional mean vector
  • $\Sigma$ is the $n \times n$ covariance matrix, $\Sigma = \mathbb{E}[(\mathbf{x} - \boldsymbol{\mu})(\mathbf{x} - \boldsymbol{\mu})^T]$

Consider two multivariate Gaussian distributions $p(\mathbf{x}) = \mathcal{N}(\boldsymbol{\mu}_1, \Sigma_1)$ and $q(\mathbf{x}) = \mathcal{N}(\boldsymbol{\mu}_2, \Sigma_2)$.

\[
\begin{aligned}
D_{\mathrm{KL}}(p(\mathbf{x}) \,\|\, q(\mathbf{x}))
&= \mathbb{E}_{p(\mathbf{x})} \Biggl[\log \frac{p(\mathbf{x})}{q(\mathbf{x})}\Biggr]\\
&= \mathbb{E}_{p(\mathbf{x})} \Bigl[\log p(\mathbf{x}) - \log q(\mathbf{x})\Bigr]\\
&= \mathbb{E}_{p(\mathbf{x})} \Bigl[- \log (2\pi)^{n/2} - \frac{1}{2}\log |\Sigma_1| - \frac{1}{2}(\mathbf{x} - \boldsymbol{\mu}_1)^T \Sigma_1^{-1} (\mathbf{x} - \boldsymbol{\mu}_1)\\
&\quad + \log (2\pi)^{n/2} + \frac{1}{2}\log |\Sigma_2| + \frac{1}{2}(\mathbf{x} - \boldsymbol{\mu}_2)^T \Sigma_2^{-1} (\mathbf{x} - \boldsymbol{\mu}_2) \Bigr]\\
&= \frac{1}{2} \Biggl(\log \frac{|\Sigma_2|}{|\Sigma_1|} - \mathbb{E}_{p(\mathbf{x})}\Bigl[(\mathbf{x} - \boldsymbol{\mu}_1)^T \Sigma_1^{-1} (\mathbf{x} - \boldsymbol{\mu}_1) \Bigr]\\
&\quad + \mathbb{E}_{p(\mathbf{x})} \Bigl[ (\mathbf{x} - \boldsymbol{\mu}_2)^T \Sigma_2^{-1} (\mathbf{x} - \boldsymbol{\mu}_2)\Bigr] \Biggr)\\
&= \frac{1}{2} \Biggl(\log \frac{|\Sigma_2|}{|\Sigma_1|} - \mathbb{E}_{p(\mathbf{x})} \Bigl[\operatorname{tr}\Bigl((\mathbf{x} - \boldsymbol{\mu}_1)^T \Sigma_1^{-1} (\mathbf{x} - \boldsymbol{\mu}_1)\Bigr) \Bigr]\\
&\quad + \mathbb{E}_{p(\mathbf{x})} \Bigl[\operatorname{tr}\Bigl((\mathbf{x} - \boldsymbol{\mu}_2)^T \Sigma_2^{-1} (\mathbf{x} - \boldsymbol{\mu}_2)\Bigr) \Bigr] \Biggr)\\
&= \frac{1}{2} \Biggl(\log \frac{|\Sigma_2|}{|\Sigma_1|} - \mathbb{E}_{p(\mathbf{x})} \Bigl[\operatorname{tr}\Bigl(\Sigma_1^{-1} (\mathbf{x} - \boldsymbol{\mu}_1)(\mathbf{x} - \boldsymbol{\mu}_1)^T \Bigr) \Bigr]\\
&\quad + \mathbb{E}_{p(\mathbf{x})} \Bigl[\operatorname{tr}\Bigl(\Sigma_2^{-1} (\mathbf{x} - \boldsymbol{\mu}_2)(\mathbf{x} - \boldsymbol{\mu}_2)^T \Bigr) \Bigr] \Biggr)\\
&= \frac{1}{2} \Biggl(\log \frac{|\Sigma_2|}{|\Sigma_1|} - \operatorname{tr}\Bigl( \Sigma_1^{-1} \mathbb{E}_{p(\mathbf{x})}\Bigl[ (\mathbf{x} - \boldsymbol{\mu}_1)(\mathbf{x} - \boldsymbol{\mu}_1)^T \Bigr]\Bigr)\\
&\quad + \operatorname{tr}\Bigl( \Sigma_2^{-1} \mathbb{E}_{p(\mathbf{x})}\Bigl[ (\mathbf{x} - \boldsymbol{\mu}_2)(\mathbf{x} - \boldsymbol{\mu}_2)^T \Bigr]\Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - \operatorname{tr}(\Sigma_1^{-1} \Sigma_1)\\
&\quad + \operatorname{tr}\Bigl( \Sigma_2^{-1} \mathbb{E}_{p(\mathbf{x})}\Bigl[ \mathbf{x}\mathbf{x}^T - 2 \mathbf{x} \boldsymbol{\mu}_2^T + \boldsymbol{\mu}_2 \boldsymbol{\mu}_2^T \Bigr]\Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - n\\
&\quad + \operatorname{tr}\Bigl( \Sigma_2^{-1} \mathbb{E}_{p(\mathbf{x})}\Bigl[ \Sigma_1 + 2\mathbf{x}\boldsymbol{\mu}_1^T - \boldsymbol{\mu}_1 \boldsymbol{\mu}_1^T - 2 \mathbf{x} \boldsymbol{\mu}_2^T + \boldsymbol{\mu}_2 \boldsymbol{\mu}_2^T \Bigr]\Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - n\\
&\quad + \operatorname{tr}\Bigl( \Sigma_2^{-1} \bigl(\Sigma_1 + 2 \boldsymbol{\mu}_1 \boldsymbol{\mu}_1^T - \boldsymbol{\mu}_1 \boldsymbol{\mu}_1^T - 2 \boldsymbol{\mu}_1 \boldsymbol{\mu}_2^T + \boldsymbol{\mu}_2 \boldsymbol{\mu}_2^T\bigr) \Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - n\\
&\quad + \operatorname{tr}(\Sigma_2^{-1} \Sigma_1) + \operatorname{tr}\Bigl( \Sigma_2^{-1} \bigl( \boldsymbol{\mu}_1 \boldsymbol{\mu}_1^T - 2 \boldsymbol{\mu}_1 \boldsymbol{\mu}_2^T + \boldsymbol{\mu}_2 \boldsymbol{\mu}_2^T\bigr) \Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - n\\
&\quad + \operatorname{tr}(\Sigma_2^{-1} \Sigma_1) + \operatorname{tr}\Bigl( \Sigma_2^{-1} (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_2) (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_2)^T \Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - n\\
&\quad + \operatorname{tr}(\Sigma_2^{-1} \Sigma_1) + \operatorname{tr}\Bigl( (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_2)^T \Sigma_2^{-1} (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_2) \Bigr) \Biggr)\\
&= \frac{1}{2} \Biggl( \log \frac{|\Sigma_2|}{|\Sigma_1|} - n + \operatorname{tr}(\Sigma_2^{-1} \Sigma_1) + (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_2)^T \Sigma_2^{-1} (\boldsymbol{\mu}_1 - \boldsymbol{\mu}_2) \Biggr)
\end{aligned}
\]

Note: the derivation uses that a scalar equals its own trace, the cyclic property $\operatorname{tr}(ABC) = \operatorname{tr}(BCA)$, and the linearity of trace and expectation. Cross terms such as $\mathbf{x}\boldsymbol{\mu}_2^T + \boldsymbol{\mu}_2\mathbf{x}^T$ are written as $2\mathbf{x}\boldsymbol{\mu}_2^T$ because $\Sigma_2^{-1}$ is symmetric, so both terms have the same trace after multiplication by $\Sigma_2^{-1}$. The replacement of $\mathbf{x}\mathbf{x}^T$ by $\Sigma_1 + 2\mathbf{x}\boldsymbol{\mu}_1^T - \boldsymbol{\mu}_1\boldsymbol{\mu}_1^T$ holds in expectation under $p$ (inside the trace), since $\mathbb{E}_{p(\mathbf{x})}[(\mathbf{x} - \boldsymbol{\mu}_1)(\mathbf{x} - \boldsymbol{\mu}_1)^T] = \Sigma_1$.

From