
Unbiasedness and Variance

The OLS Estimator is Unbiased

We have

$$
\begin{align*}
\bold{b}&=\bold{(X'X)}^{-1}\bold{X'y},\\
&=\bold{(X'X)}^{-1}\bold{X'}(\bold{X}\boldsymbol{\beta}+\boldsymbol{\varepsilon}),\\
&=\boldsymbol{\beta} + \bold{(X'X)}^{-1}\bold{X'}\boldsymbol{\varepsilon},
\end{align*}
$$

taking expectations on both sides (conditional on $\bold{X}$), we get

$$
\begin{align*}
\mathbb{E}[\bold{b|X}]&=\mathbb{E}[\boldsymbol{\beta}|\bold{X}] + \mathbb{E}[\bold{(X'X)}^{-1}\bold{X'}\boldsymbol{\varepsilon}|\bold{X}],\\
&=\boldsymbol{\beta} + \bold{(X'X)}^{-1}\bold{X'}\underbrace{\mathbb{E}[\boldsymbol{\varepsilon}|\bold{X}]}_{=\bold{0}},\\
&= \boldsymbol{\beta}.
\end{align*}
$$
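As a sanity check, here is a minimal simulation sketch (assuming NumPy; the design matrix and coefficients are illustrative and held fixed across replications): averaging $\bold{b}$ over many draws of $\boldsymbol{\varepsilon}$ should recover $\boldsymbol{\beta}$.

```python
import numpy as np

rng = np.random.default_rng(0)

n, K = 100, 3
beta = np.array([1.0, 2.0, -0.5])                                # true coefficients
X = np.column_stack([np.ones(n), rng.normal(size=(n, K - 1))])   # design held fixed

# Average the OLS estimate over many draws of the error term
R = 5_000
b_bar = np.zeros(K)
for _ in range(R):
    eps = rng.normal(scale=1.5, size=n)          # E[eps | X] = 0
    y = X @ beta + eps
    b = np.linalg.solve(X.T @ X, X.T @ y)        # b = (X'X)^{-1} X'y
    b_bar += b / R

print(b_bar)   # close to beta, illustrating E[b | X] = beta
```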

Variance of the OLS Estimator

We have

$$
\begin{align*}
\bold{b}&=\boldsymbol{\beta} + \underbrace{\bold{(X'X)}^{-1}\bold{X'}}_{=\bold{A}}\boldsymbol{\varepsilon},\\
&=\boldsymbol{\beta} + \bold{A}\boldsymbol{\varepsilon},\\
\implies \bold{b}-\boldsymbol{\beta}&=\bold{A}\boldsymbol{\varepsilon}.
\end{align*}
$$

Since $\bold{b}$ is a vector of dimension $(K \times 1)$, its variance-covariance matrix is of dimension $(K \times K)$:

$$
\begin{align*}
\mathbb{Var}(\bold{b})&=\begin{bmatrix}
\mathbb{Var}(b_0) & \mathbb{Cov}(b_0,b_1) & \cdots & \mathbb{Cov}(b_0,b_K)\\
\mathbb{Cov}(b_1,b_0) & \mathbb{Var}(b_1) & \cdots & \mathbb{Cov}(b_1,b_K)\\
\vdots & \vdots & \ddots & \vdots\\
\mathbb{Cov}(b_K,b_0) & \mathbb{Cov}(b_K,b_1) & \cdots & \mathbb{Var}(b_K)
\end{bmatrix}_{(K \times K)},\\
&=\begin{bmatrix}
\mathbb{E}[(b_0-\beta_0)^2] & \mathbb{E}[(b_0-\beta_0)(b_1-\beta_1)] & \cdots & \mathbb{E}[(b_0-\beta_0)(b_K-\beta_K)]\\
\mathbb{E}[(b_1-\beta_1)(b_0-\beta_0)] & \mathbb{E}[(b_1-\beta_1)^2] & \cdots & \mathbb{E}[(b_1-\beta_1)(b_K-\beta_K)]\\
\vdots & \vdots & \ddots & \vdots\\
\mathbb{E}[(b_K-\beta_K)(b_0-\beta_0)] & \mathbb{E}[(b_K-\beta_K)(b_1-\beta_1)] & \cdots & \mathbb{E}[(b_K-\beta_K)^2]
\end{bmatrix}_{(K \times K)},\\
&=\mathbb{E}[(\bold{b}-\boldsymbol{\beta})(\bold{b}-\boldsymbol{\beta})'],\\
&=\mathbb{E}[\bold{A}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon}'\bold{A}'].
\end{align*}
$$

Conditioning on $\bold{X}$ (and noting that $\bold{A}$ is a function of $\bold{X}$ alone),

$$
\begin{align*}
\mathbb{Var}(\bold{b|X})&=\mathbb{E}[\bold{A}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}\bold{A'|X}],\\
&=\bold{A}\,\mathbb{E}[\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}\bold{|X}]\,\bold{A'}.
\end{align*}
$$

Given $\mathbb{E}[\boldsymbol{\varepsilon}|\bold{X}]=\bold{0}$ (and hence $\mathbb{E}[\boldsymbol{\varepsilon}]=\bold{0}$), we have

$$
\begin{align*}
\mathbb{E}[\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}]&=\begin{bmatrix}
\mathbb{Var}(\varepsilon_1) & \mathbb{Cov}(\varepsilon_1,\varepsilon_2) & \cdots & \mathbb{Cov}(\varepsilon_1,\varepsilon_n)\\
\mathbb{Cov}(\varepsilon_2,\varepsilon_1) & \mathbb{Var}(\varepsilon_2) & \cdots & \mathbb{Cov}(\varepsilon_2,\varepsilon_n)\\
\vdots & \vdots & \ddots & \vdots\\
\mathbb{Cov}(\varepsilon_n,\varepsilon_1) & \mathbb{Cov}(\varepsilon_n,\varepsilon_2) & \cdots & \mathbb{Var}(\varepsilon_n)
\end{bmatrix}_{(n \times n)},\\
\mathbb{E}[\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}\bold{|X}]&=\begin{bmatrix}
\mathbb{Var}(\varepsilon_1|\bold{X}) & \mathbb{Cov}(\varepsilon_1,\varepsilon_2|\bold{X}) & \cdots & \mathbb{Cov}(\varepsilon_1,\varepsilon_n|\bold{X})\\
\mathbb{Cov}(\varepsilon_2,\varepsilon_1|\bold{X}) & \mathbb{Var}(\varepsilon_2|\bold{X}) & \cdots & \mathbb{Cov}(\varepsilon_2,\varepsilon_n|\bold{X})\\
\vdots & \vdots & \ddots & \vdots\\
\mathbb{Cov}(\varepsilon_n,\varepsilon_1|\bold{X}) & \mathbb{Cov}(\varepsilon_n,\varepsilon_2|\bold{X}) & \cdots & \mathbb{Var}(\varepsilon_n|\bold{X})
\end{bmatrix}_{(n \times n)},\\
&=\begin{bmatrix}
\mathbb{E}[\varepsilon_1^2|\bold{X}] & \mathbb{E}[\varepsilon_1\varepsilon_2|\bold{X}] & \cdots & \mathbb{E}[\varepsilon_1\varepsilon_n|\bold{X}]\\
\mathbb{E}[\varepsilon_2\varepsilon_1|\bold{X}] & \mathbb{E}[\varepsilon_2^2|\bold{X}] & \cdots & \mathbb{E}[\varepsilon_2\varepsilon_n|\bold{X}]\\
\vdots & \vdots & \ddots & \vdots\\
\mathbb{E}[\varepsilon_n\varepsilon_1|\bold{X}] & \mathbb{E}[\varepsilon_n\varepsilon_2|\bold{X}] & \cdots & \mathbb{E}[\varepsilon_n^2|\bold{X}]
\end{bmatrix}_{(n \times n)}.
\end{align*}
$$

Under the assumptions of homoskedasticity $(\mathbb{E}[\varepsilon_i^2|\bold{X}]=\sigma^2)$ and no autocorrelation $(\mathbb{E}[\varepsilon_i\varepsilon_j|\bold{X}]=0$ for $i\neq j)$, we get

$$
\begin{align*}
\mathbb{E}[\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}\bold{|X}]&=\sigma^2 \bold{I_n},\\
\implies \mathbb{Var}(\bold{b|X})&=\bold{A}\,\sigma^2 \bold{I_n}\,\bold{A'},\\
&=\bold{(X'X)}^{-1}\bold{X'}\,\sigma^2\,[\bold{(X'X)}^{-1}\bold{X'}]',\\
&=\sigma^2\underbrace{\bold{(X'X)}^{-1}\bold{X'X}}_{=\bold{I}_K}[\bold{(X'X)}^{-1}]',\\
&=\sigma^2[\bold{(X'X)}^{-1}]',\\
&=\sigma^2[\bold{(X'X)'}]^{-1},\\
&=\sigma^2\bold{(X'X)}^{-1}.
\end{align*}
$$
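This formula is easy to check numerically. The sketch below (illustrative only, assuming NumPy and simulated homoskedastic errors) compares $\sigma^2\bold{(X'X)}^{-1}$ with the Monte Carlo covariance of $\bold{b}$ across repeated error draws for a fixed $\bold{X}$.

```python
import numpy as np

rng = np.random.default_rng(1)

n, K, sigma = 200, 3, 2.0
X = np.column_stack([np.ones(n), rng.normal(size=(n, K - 1))])
beta = np.array([1.0, 2.0, -0.5])

# Theoretical conditional variance: sigma^2 (X'X)^{-1}
V_theory = sigma**2 * np.linalg.inv(X.T @ X)

# Monte Carlo covariance of b over repeated error draws (same X)
bs = [np.linalg.solve(X.T @ X, X.T @ (X @ beta + rng.normal(scale=sigma, size=n)))
      for _ in range(5_000)]
V_mc = np.cov(np.array(bs), rowvar=False)

print(np.abs(V_theory - V_mc).max())   # small discrepancy between the two matrices
```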

We still cannot compute $\mathbb{Var}(\bold{b|X})$ because $\sigma^2$ is an unknown population parameter; we have to estimate it.

Estimating $\sigma^2$

We know that

$$
\mathbb{E}[\varepsilon_i^2]=\sigma^2.
$$

The sample counterpart of $\varepsilon_i$ is the residual $e_i$, defined as $e_i=y_i-\bold{x}_i'\bold{b}$. An intuitive estimator of $\sigma^2$ would be $\frac{1}{n}\sum_{i=1}^n e_i^2$. It is essential, however, to check whether this estimator is unbiased.
That is, we check whether

$$
\begin{align*}
\mathbb{E}\Bigg[\frac{1}{n}\sum_{i=1}^n e_i^2\Bigg] &=\sigma^2.
\end{align*}
$$

We know that

$$
\begin{align*}
\mathbb{E}\Bigg[\frac{1}{n}\sum_{i=1}^n e_i^2\Bigg]&=\mathbb{E}\Bigg[\frac{1}{n}\bold{e'e}\Bigg]=\frac{1}{n}\mathbb{E}[\bold{e'e}], \tag{1}
\end{align*}
$$

The residual vector $\bold{e}$ can be written as $\bold{e}=\bold{My}=\bold{M}[\bold{X}\boldsymbol{\beta}+\boldsymbol{\varepsilon}]=\bold{M}\boldsymbol{\varepsilon}$ since $\bold{MX}=\bold{0}$, where $\bold{M}=\bold{I_n}-\bold{X(X'X)^{-1}X'}$ is the residual-maker matrix. Therefore
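The claimed properties of $\bold{M}$ can be verified numerically; a small sketch (assuming NumPy and an arbitrary simulated $\bold{X}$):

```python
import numpy as np

rng = np.random.default_rng(2)
n, K = 50, 4
X = np.column_stack([np.ones(n), rng.normal(size=(n, K - 1))])

# Residual-maker matrix M = I_n - X (X'X)^{-1} X'
M = np.eye(n) - X @ np.linalg.inv(X.T @ X) @ X.T

print(np.allclose(M, M.T))      # symmetric:  M = M'
print(np.allclose(M, M @ M))    # idempotent: M = M^2
print(np.allclose(M @ X, 0))    # annihilates X: MX = 0
```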

$$
\begin{align*}
\mathbb{E}[\bold{e'e}]&=\mathbb{E}[(\bold{M}\boldsymbol{\varepsilon})'(\bold{M}\boldsymbol{\varepsilon})],\\
&=\mathbb{E}[\boldsymbol{\varepsilon'}\bold{M'M}\boldsymbol{\varepsilon}],
\end{align*}
$$

$\bold{M}$ is symmetric $(\bold{M}=\bold{M'})$ and idempotent $(\bold{M}=\bold{M}^2)$, hence $\bold{M'M}=\bold{M}^2=\bold{M}$. Therefore

$$
\begin{align*}
\mathbb{E}[\bold{e'e}]&=\mathbb{E}[\boldsymbol{\varepsilon'}\bold{M'M}\boldsymbol{\varepsilon}],\\
&=\mathbb{E}[\boldsymbol{\varepsilon'}\bold{M}\boldsymbol{\varepsilon}],
\end{align*}
$$

the matrices $\bold{e'e}$ and $\boldsymbol{\varepsilon'}\bold{M}\boldsymbol{\varepsilon}$ have dimension $1\times 1$, so each equals its own trace, and the trace is invariant under cyclic permutations. Therefore

$$
\begin{align*}
\mathbb{E}[\bold{e'e}]&=\mathbb{E}[\text{Tr}(\boldsymbol{\varepsilon'}\bold{M}\boldsymbol{\varepsilon})],\\
&=\mathbb{E}[\text{Tr}(\bold{M}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'})],
\end{align*}
$$

we know that $\mathbb{E}[\text{Tr}(\bold{X})]=\text{Tr}(\mathbb{E}[\bold{X}])$ [How?], therefore,

$$
\begin{align*}
\mathbb{E}[\bold{e'e}]&=\mathbb{E}[\text{Tr}(\bold{M}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'})]=\text{Tr}(\mathbb{E}[\bold{M}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}]),\\
\mathbb{E}[\bold{e'e}|\bold{X}]&=\text{Tr}(\mathbb{E}[\bold{M}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}|\bold{X}]),
\end{align*}
$$

$\bold{M}$ is a function of $\bold{X}$ alone, so it can be taken outside the conditional expectation. Therefore

$$
\begin{align*}
\mathbb{E}[\bold{e'e}|\bold{X}]&=\text{Tr}(\mathbb{E}[\bold{M}\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}|\bold{X}])=\text{Tr}(\bold{M}\underbrace{\mathbb{E}[\boldsymbol{\varepsilon}\boldsymbol{\varepsilon'}|\bold{X}]}_{=\sigma^2\bold{I_n}}),\\
&=\text{Tr}(\bold{M}\sigma^2\bold{I_n}),\\
&=\sigma^2\text{Tr}(\bold{M}),\\
&=\sigma^2\text{Tr}(\bold{I_n}-\bold{X(X'X)^{-1}X'}),\\
&=\sigma^2\Big\{\text{Tr}(\bold{I_n})-\text{Tr}(\underbrace{\bold{X(X'X)^{-1}}}_{\bold{A}}\underbrace{\bold{X'}}_{\bold{B}})\Big\},
\end{align*}
$$

We know that $\text{Tr}(\bold{AB})=\text{Tr}(\bold{BA})$, therefore

$$
\begin{align*}
\mathbb{E}[\bold{e'e}|\bold{X}]&=\sigma^2\Big\{\text{Tr}(\bold{I_n})-\text{Tr}(\bold{X(X'X)^{-1}X'})\Big\}=\sigma^2\Big\{\text{Tr}(\bold{I_n})-\text{Tr}(\bold{X'X(X'X)^{-1}})\Big\},\\
&=\sigma^2\Big\{\text{Tr}(\bold{I_n})-\text{Tr}(\bold{I_K})\Big\},\\
&=\sigma^2(n-K).
\end{align*}
$$

Applying the law of iterated expectations [Here],

$$
\mathbb{E}_{\bold{X}}\big[\mathbb{E}[\bold{e'e}|\bold{X}]\big]=\mathbb{E}[\bold{e'e}]=\mathbb{E}_{\bold{X}}[\sigma^2(n-K)]=\sigma^2(n-K).
$$
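A quick simulation check of this result (illustrative only, assuming NumPy and simulated data): $\text{Tr}(\bold{M})$ equals $n-K$ exactly, and the average of $\bold{e'e}$ over many error draws should be close to $\sigma^2(n-K)$.

```python
import numpy as np

rng = np.random.default_rng(3)
n, K, sigma = 80, 5, 1.3
X = np.column_stack([np.ones(n), rng.normal(size=(n, K - 1))])
beta = rng.normal(size=K)

M = np.eye(n) - X @ np.linalg.inv(X.T @ X) @ X.T
print(np.isclose(np.trace(M), n - K))      # Tr(M) = n - K holds exactly

# Average e'e over repeated error draws; should approach sigma^2 (n - K)
R = 20_000
sse = 0.0
for _ in range(R):
    y = X @ beta + rng.normal(scale=sigma, size=n)
    e = M @ y                              # e = My = M eps
    sse += (e @ e) / R

print(sse, sigma**2 * (n - K))             # the two numbers should be close
```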

Rewriting $(1)$ again,

$$
\begin{align*}
\mathbb{E}\Bigg[\frac{1}{n}\sum_{i=1}^n e_i^2\Bigg]&=\frac{1}{n}\mathbb{E}[\bold{e'e}]=\frac{1}{n}\sigma^2(n-K),
\end{align*}
$$

we can see that $\frac{1}{n}\sum_{i=1}^n e_i^2$ is not an unbiased estimator of $\sigma^2$ (it is biased downward by the factor $\frac{n-K}{n}$), but from the above relation we can construct an unbiased estimator of $\sigma^2$ as follows:

$$
\begin{align*}
\frac{1}{n}\mathbb{E}[\bold{e'e}]\cdot\frac{n}{n-K}&=\sigma^2,\\
\implies \mathbb{E}\Bigg[\frac{\bold{e'e}}{n-K}\Bigg]&=\sigma^2.
\end{align*}
$$

Therefore

$$
\hat{\sigma}^2=\frac{\bold{e'e}}{n-K}.
$$
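To see the difference between dividing by $n$ and by $n-K$, here is a short simulation sketch (assuming NumPy; all numbers are illustrative): across replications, the average of $\bold{e'e}/(n-K)$ should be close to $\sigma^2$, while the average of $\bold{e'e}/n$ falls short by the factor $(n-K)/n$.

```python
import numpy as np

rng = np.random.default_rng(5)
n, K, sigma = 30, 6, 2.0                   # small n makes the bias visible
X = np.column_stack([np.ones(n), rng.normal(size=(n, K - 1))])
beta = rng.normal(size=K)

R = 20_000
s2_biased, s2_unbiased = 0.0, 0.0
for _ in range(R):
    y = X @ beta + rng.normal(scale=sigma, size=n)
    b = np.linalg.solve(X.T @ X, X.T @ y)
    e = y - X @ b
    s2_biased   += (e @ e) / n / R         # divides by n
    s2_unbiased += (e @ e) / (n - K) / R   # divides by n - K

# s2_biased ≈ sigma^2 (n - K) / n, while s2_unbiased ≈ sigma^2
print(s2_biased, s2_unbiased, sigma**2)
```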

Hence

$$
\begin{align*}
\mathbb{Var}(\bold{b|X})&=\sigma^2\bold{(X'X)}^{-1},\\
\widehat{\mathbb{Var}}(\bold{b|X})&=\hat{\sigma}^2\bold{(X'X)}^{-1}=\frac{\bold{e'e}}{n-K}\bold{(X'X)}^{-1}. \hspace{15px}\blacksquare
\end{align*}
$$
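Putting the pieces together, a minimal end-to-end sketch (assuming NumPy; the simulated data and dimensions are illustrative) that computes $\bold{b}$, $\hat{\sigma}^2$, and the estimated $\widehat{\mathbb{Var}}(\bold{b|X})$ with its standard errors:

```python
import numpy as np

rng = np.random.default_rng(4)
n, K, sigma = 150, 3, 1.0
X = np.column_stack([np.ones(n), rng.normal(size=(n, K - 1))])
beta = np.array([0.5, -1.0, 2.0])
y = X @ beta + rng.normal(scale=sigma, size=n)

XtX_inv = np.linalg.inv(X.T @ X)
b = XtX_inv @ X.T @ y            # OLS estimate  b = (X'X)^{-1} X'y
e = y - X @ b                    # residuals
sigma2_hat = (e @ e) / (n - K)   # unbiased estimator of sigma^2
V_hat = sigma2_hat * XtX_inv     # estimated Var(b | X)
se = np.sqrt(np.diag(V_hat))     # standard errors of the coefficients

print(b)
print(se)
```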