#####################################################################
### ProblemSet 05: Principal Component Analysis
#####################################################################
### Preface
# clear workspace, console, and any open graphics device
rm(list = ls())
cat("\014")
if (!is.null(dev.list())) dev.off() # close the graphics device only if one is open
#library(plm) # panel methods - regression models
library(mvtnorm) # random draws from a multivariate normal distribution
#library(MASS) # to fit LDA and QDA models
library(glmnet) # lasso and ridge regression
# Note: predict() on lm objects expects newdata as a data frame,
# while glmnet works directly with matrices
# data config
N = 300
p = 2
mu = 0
## DGP
# Variance-Covariance Matrix, beta vector
set.seed(123)
temp = abs(rnorm(p, 0, 1)) # random positive variances (sd = 1, since 1^0.5 = 1)
covmat = diag(temp)
beta = runif(p, min = 0.1, max = 0.5)
# sample data and generate the y's
X = rmvnorm(N, mean = rep(mu, p), sigma = covmat)
eps = rnorm(N, 0, 1)
Y = X %*% beta + eps
rm(temp, eps)
#####################################################################
### Exercise 1
## a)
### how to implement the eigenvector calculation for PCA
# 1. demean X and compute its covariance matrix cov(X)
# 2. compute the eigenvectors phi of cov(X)
# 3. obtain the transformed scores Z = X %*% phi
###
X_m = scale(X, center = TRUE, scale = FALSE) # demean each column (sweep out the column means)
covx = cov(X_m) # cov() demeans internally, so this equals cov(X)
phi = eigen(covx)
ev = phi$vectors
ev1 = phi$vectors[, 1] # first eigenvector -> principal component 1
ev2 = phi$vectors[, 2] # second eigenvector -> principal component 2
plot(X[, 1], X[, 2], asp = 1) # asp = 1 so angles are not distorted
abline(0, ev1[2] / ev1[1], col = "red")  # direction of the first eigenvector
abline(0, ev2[2] / ev2[1], col = "blue") # direction of the second eigenvector
# Why might these eigenvectors not look orthogonal? Without asp = 1 the two
# axes are scaled differently, which visually distorts the right angle.
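# Sanity check (a minimal sketch): eigenvectors of the symmetric matrix
# cov(X) are orthonormal, so t(ev) %*% ev should be numerically the identity.
round(t(ev) %*% ev, 10) # ~ 2x2 identity matrix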
Z = X%*%ev
Z1 = Z[,1]
Z2 = Z[,2]
# PCA via R's built-in prcomp (centers the data by default)
pca = prcomp(X)$rotation
ZT = X %*% pca
# compare eigenvector method vs. package; eigenvectors are identified only
# up to sign, so compare absolute values to ignore harmless sign flips
plot(Z, ZT, xlab = "eigenvector method", ylab = "prcomp package")
error = sum(abs(abs(ZT) - abs(Z)) > 0.01)
print(error)
## b)
# draw test data
X_test = rmvnorm(N, mean = rep(mu, p), sigma = covmat)
eps_test = rnorm(N, 0, 1)
Y_test = X_test %*% beta + eps_test
X_test_d = data.frame(X_test) # predict.lm expects a data frame,
Y_test_d = data.frame(Y_test) # while predict.glmnet takes a matrix
# project the test data onto the same eigenvectors before predicting;
# if newdata lacks Z1/Z2, predict() silently falls back to the training
# scores found in the global environment
Z_test = X_test %*% ev
# perform PCR on Z1 and then on Z1 + Z2
PCR1 = lm(Y ~ Z1)
PCR1_predict = predict(PCR1, newdata = data.frame(Z1 = Z_test[, 1]))
Test_error_PCR1 = (1/N) * sum((Y_test - PCR1_predict)^2)
PCR2 = lm(Y ~ Z1 + Z2)
PCR2_predict = predict(PCR2, newdata = data.frame(Z1 = Z_test[, 1], Z2 = Z_test[, 2]))
Test_error_PCR2 = (1/N) * sum((Y_test - PCR2_predict)^2)
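# As a quick diagnostic (sketch), prcomp's importance table shows how much
# variance each component captures, motivating the one- vs. two-component choice:
summary(prcomp(X))$importance # row "Proportion of Variance" per component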
# Ridge (glmnet is already loaded in the preface)
grid = 10^seq(10, -2, length = 100)
ridge.mod = glmnet(X, Y, alpha = 0, lambda = grid)
set.seed(1)
cv.out = cv.glmnet(X, Y, alpha = 0) # 10-fold CV over glmnet's default lambda path
plot(cv.out)
bestlam = cv.out$lambda.min
ridge.pred = predict(ridge.mod, s = bestlam, newx = X_test)
Test_error_ridge = (1/N) * sum((ridge.pred - Y_test)^2)
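# Note (assumption about the glmnet version): predict() linearly interpolates
# when s = bestlam is not on the fitted lambda grid; exact = TRUE refits at
# bestlam and requires passing the original data again:
ridge.pred.exact = predict(ridge.mod, s = bestlam, newx = X_test, exact = TRUE, x = X, y = Y)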
# OLS: fit on a data frame so predict() matches the test columns by name
OLS = lm(Y ~ ., data = data.frame(Y = Y, X))
OLS_predict = predict(OLS, newdata = X_test_d)
Test_error_OLS = (1/N) * sum((Y_test - OLS_predict)^2)
paste("OLS Test-error:", format(Test_error_OLS, digit=4))
paste("PCR one direction Test-error:", format(Test_error_PCR1, digit=4))
paste("PCR both directions / true model Test-error:", format(Test_error_PCR2, digit=4))
paste("Ridge Test-error:", format(Test_error_ridge, digit=4))
paste("Ridge regression performs the best!")
#####################################################################
### Exercise 2 - Discussion and Simulation Study
# first set p = 50
# 1. DGP most suited for PCA
#    -> centered data (standard normal N(0,1)) whose variance is
#       concentrated in a few directions
# 2. DGP most suited for ridge
#    -> many coefficients in beta different from zero (but small)
# 3. DGP most suited for OLS (BLUE estimation)
#    -> linear model with uncorrelated, homoskedastic errors with mean zero
#    -> regressors with high variance driving the DGP
#    -> the larger N, the more precise the estimation
# (hedged covmat sketches for these cases follow right after this list)
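# Hedged sketches of DGP specifications matching the three cases above
# (assumed parameterizations, mirroring the commented "covmat" variants
# inside the MCN loop below):
# covmat = diag(c(10, rep(0.1, p - 1)))   # one dominant variance direction -> favors PCR
# beta   = runif(p, min = 0.1, max = 0.5) # many small nonzero coefficients -> favors ridge
# covmat = diag(p)                        # uncorrelated, homoskedastic design -> OLS is BLUE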
### Simulation study to compare the methods under a given DGP
# clear workspace, console, and any open graphics device
rm(list = ls())
cat("\014")
if (!is.null(dev.list())) dev.off()
library(mvtnorm) # random draws from a multivariate normal distr.
library(glmnet) # Lasso and Ridge regression
######################################################################
n=100
p=50
mu=0
beta = runif(p, min = 0.01, max = 0.99) # overwritten inside the loop below
MCN = 100
MSE = matrix(NaN, 4, MCN)
# Note: to check different specifications of the DGP, see the
# "covmat" comments inside the MCN loop
######################################################################
for (i in 1:MCN){
#generate training set
######################################################################
beta = runif(p, min = 0.1, max = 0.5)
beta[2:p] = 0 # sparse DGP: only the first regressor has a nonzero coefficient
#a=runif(1,min=-0.0,max=0.0)
#covmat=matrix(c(1,a,a,2),2,2)
temp=runif(p, min = 0.5, max = 1.5)
covmat=diag(temp)
#covmat[1:p,1:p]=1
#covmat=matrix(runif(p*p,min=0.5,max=0.9),p)
#covmat=forceSymmetric(covmat)
#covmat=matrix(covmat@x,p)
#covmat[1,1]=10
######################################################################
eps=rnorm(n,0,1)
X = rmvnorm(n, mean = rep(mu, p), sigma = covmat)
Y = X %*% beta + eps
# generate test set
X_test = rmvnorm(n, mean = rep(mu, p), sigma = covmat)
eps_test = rnorm(n, 0, 1)
Y_test = X_test %*% beta + eps_test
X_test_d = data.frame(X_test)
Y_test_d = data.frame(Y_test)
# calculate Z using PCA and project the test data onto the same rotation
q = prcomp(X)$rotation
Z = X %*% q
Z1 = Z[, 1]
Z2 = Z[, 2]
Z_test = X_test %*% q
# do PCR; newdata must contain Z1/Z2, otherwise predict() silently
# reuses the training scores from the enclosing environment
PCR1 = lm(Y ~ Z1)
PCR1_predict = predict(PCR1, newdata = data.frame(Z1 = Z_test[, 1]))
Test_error_PCR1 = (1/n) * sum((Y_test - PCR1_predict)^2)
PCR2 = lm(Y ~ Z1 + Z2)
PCR2_predict = predict(PCR2, newdata = data.frame(Z1 = Z_test[, 1], Z2 = Z_test[, 2]))
Test_error_PCR2 = (1/n) * sum((Y_test - PCR2_predict)^2)
# Ridge
ridge.mod = glmnet(X, Y, alpha = 0)
cv.out = cv.glmnet(X, Y, alpha = 0)
bestlam = cv.out$lambda.min
ridge.pred = predict(ridge.mod, s = bestlam, newx = X_test)
Test_error_ridge = (1/n) * sum((ridge.pred - Y_test)^2)
# OLS: fit on a data frame so predict() matches the test columns by name
OLS = lm(Y ~ ., data = data.frame(Y = Y, X))
OLS_predict = predict(OLS, newdata = X_test_d)
Test_error_OLS = (1/n) * sum((Y_test - OLS_predict)^2)
#save results for each run
MSE[1,i]=Test_error_OLS
MSE[2,i]=Test_error_PCR1
MSE[3,i]=Test_error_PCR2
MSE[4,i]=Test_error_ridge
}
### running average of the test errors across Monte Carlo runs
MSE_ave = matrix(NaN, 4, MCN)
for (l in 1:MCN){
  MSE_ave[, l] = rowMeans(MSE[, 1:l, drop = FALSE]) # running mean over the first l runs
}
### plotting results from the simulation study
plot(MSE_ave[2, ], type = "l", col = "green", ylim = range(MSE_ave), xlim = c(0, 100),
     main = "PCR 1, PCR 2, Ridge, OLS", ylab = "average MSE", xlab = "number of runs")
lines(MSE_ave[3, ], col = "blue")
lines(MSE_ave[4, ], col = "red")
lines(MSE_ave[1, ], col = "black")
legend(40, max(MSE_ave), legend = c("PCR 1", "PCR 2", "Ridge", "OLS"),
       col = c("green", "blue", "red", "black"), lty = 1, cex = 0.8)
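# Final averaged test errors after all MCN runs (a quick numeric summary;
# the row labels are added here for readability and follow the fill
# order of MSE above):
rownames(MSE_ave) = c("OLS", "PCR 1", "PCR 2", "Ridge")
print(round(MSE_ave[, MCN], 4))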