-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathHomeWork1.Rmd
215 lines (156 loc) · 7.74 KB
/
HomeWork1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
---
title: "HW1"
output: pdf_document
date: "`r Sys.Date()`"
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
## R Markdown
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.
When you click the **Knit** button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
##Load Packages
```{r loadPackages, warning=FALSE, message=FALSE, results='hide'}
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse,reshape,ggplots,ggmap,
mblench,data.table)
search()
theme_set(theme_classic())
```
## Read unitlies data
```{r readData}
#read in Utilities.csv
utilities.df <- fread("Utilities.csv")
str(utilities.df)
#data.table
utilities.dt <- setDT(utilities.df)
str(utilities.dt)
summary(utilities.dt)
```
### 1) min,max,mean,median,std deviation
```{r question1}
class(utilities.dt[,Fuel_Cost])
#get numeric dat table
numeric.utilities.dt <- utilities.dt[, !c("Company")]
#Mean of all columns
numeric.utilities.dt[, lapply(.SD, mean)]
#standard deviation of all varibales
numeric.utilities.dt[, lapply(.SD, sd)]
#min of all varibales
numeric.utilities.dt[, lapply(.SD, min)]
#max of all varibales
numeric.utilities.dt[, lapply(.SD, max)]
#median of all variables
numeric.utilities.dt[, lapply(.SD, median)]
# Compute Statistics - mean, standard dev, min, max, median of Fixed_charge
numeric.utilities.dt[,.(mean=mean(Fixed_charge),sd=sd(Fixed_charge),minimum=min(Fixed_charge)
,maximum=max(Fixed_charge),median=median(Fixed_charge),length=length(Fixed_charge))]
# Compute Statistics - mean, standard dev, min, max, median of RoR
utilities.dt[,.(mean=mean(RoR),sd=sd(RoR),minimum=min(RoR)
,maximum=max(RoR),median=median(RoR),length=length(RoR))]
# Compute Statistics - mean, standard dev, min, max, median of Cost
utilities.dt[,.(mean=mean(Cost),sd=sd(Cost),minimum=min(Cost)
,maximum=max(Cost),median=median(Cost),length=length(Cost))]
# Compute Statistics - mean, standard dev, min, max, median of load_factor
utilities.dt[,.(mean=mean(Load_factor),sd=sd(Load_factor),minimum=min(Load_factor)
,maximum=max(Load_factor),median=median(Load_factor),length=length(Load_factor))]
# Compute Statistics - mean, standard dev, min, max, median of demand_growth
utilities.dt[,.(mean=mean(Demand_growth),sd=sd(Demand_growth),minimum=min(Demand_growth)
,maximum=max(Demand_growth),median=median(Demand_growth),length=length(Demand_growth))]
# Compute Statistics - mean, standard dev, min, max, median of Sales
utilities.dt[,.(mean=mean(Sales),sd=sd(Sales),minimum=min(Sales)
,maximum=max(Sales),median=median(Sales),length=length(Sales))]
# Compute Statistics - mean, standard dev, min, max, median of Nuclear
utilities.dt[,.(mean=mean(Nuclear),sd=sd(Nuclear),minimum=min(Nuclear)
,maximum=max(Nuclear),median=median(Nuclear),length=length(Nuclear))]
# Compute Statistics - mean, standard dev, min, max, median of Fuel_cost
utilities.dt[,.(mean=mean(Fuel_Cost),sd=sd(Fuel_Cost),minimum=min(Fuel_Cost)
,maximum=max(Fuel_Cost),median=median(Fuel_Cost),length=length(Fuel_Cost))]
print("Sales has the largest variability. The values in Sales variable are more spread apart which can be depicted from Satandard Deviation(s=3549.984)")
```
##Boxplots for each numeric varibales
```{r BoxPlots}
#Boxplot for Fixed Charge
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Fixed_charge), fill="gold1", outlier.colour = "firebrick2")+
ylab("Fixed Charge income/debt") + ggtitle("Fixed Charge Boxplot")
#utilities.dt[, sum(!is.na(Fixed_charge))]
#Boxplot for Rate of return
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=RoR), fill="gold2", outlier.colour = "firebrick2")+
ylab("Rate of Return") + ggtitle("Rate of Return Boxplot")
#Boxplot for Cost per kilowatt
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Cost), fill="gold3", outlier.colour = "firebrick2")+
ylab("Cost/kilowatt") + ggtitle("Cost Boxplot")
#boxplot for Anual Load Factor
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Load_factor), fill="gold4", outlier.colour = "firebrick2")+
ylab("Load Factor") + ggtitle("Load Factor Boxplot")
#bocplot for peak kilowatthour demand growth from 1974 to 1975
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Demand_growth), fill="gold1", outlier.colour = "firebrick2")+
ylab("Demand Growth") + ggtitle("Peak Kilowatt Hour Demand Growth Boxplot")
#boxplot for sales (kilowatthour use per year)
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Sales), fill="gold2", outlier.colour = "firebrick2")+
ylab("Sales Kilowatthour/year") + ggtitle("Sales Boxplot")
#utilities.dt[, sum(is.na(Sales))]
#boxplot for percent nuclear
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Nuclear), fill="gold3", outlier.colour = "firebrick2")+
ylab("Percent Nuclear") + ggtitle("Nuclear Boxplot")
#boxplot for total fuel costs (cents per kilowatthour)
ggplot(utilities.dt)+
geom_boxplot(aes(x="",y=Fuel_Cost), fill="gold4", outlier.colour = "firebrick2")+
ylab("Fuel Cost cents/kilowatthour") + ggtitle("Fuel Cost Boxplot")
print("Yes there are extreme values. The Varibale Fixed_charge and Sales has outliers which is highlighted in firebrick color.")
```
#Heatmap for numeric variables
```{r heatmap}
#heatmap(utilities.dt$Cost , Rowv = NULL, Colv = NULL)
heatmap(cor(utilities.dt[,!c("Company")]), Rowv = NA, Colv = NA)
num.utilities.df <- utilities.df[, !c("Company")]
round(cor(num.utilities.df),2)
heatmap.2(cor(num.utilities.df), dendrogram = "none",
cellnote = round(cor(num.utilities.df),2), notecol = "navy",
key = FALSE, trace = "none")
print("Fixed_charge & RoR have positive correlation i.e higher the Fixed-charge higher the Rate of return. Sales & Fuel_Cost have high negative correlation.")
# heatmap using ggplot
# using reshape package to to generate input for the plot
#library(reshape)
#cor.mat <- round(cor(utilities.dt[,!c("Company")]),2) # rounded correlation matrix
#melted.cor.mat <- melt(cor.mat)
#ggplot(data=melted.cor.mat) +
#scale_fill_gradient(low="wheat", high="orangered") +
#geom_tile( aes(x = Cost, y = RoR, fill = value)) +
#geom_text(aes(x = X1,y = X2, label = value)) +
#ggtitle("Which Variables Are Highly Correlated?")
```
### Principal Component Analysis using Unscaled Data
```{r PCA, warning=FALSE}
### compute PCs on two dimensions
pcs <- prcomp(data.frame(utilities.df$Fixed_charge, utilities.df$RoR))
pcs
summary(pcs)
pcs$rot # rotation matrix
scores <- pcs$x
head(scores, 5)
### PCA on 13 variables
pcs13 <- prcomp(na.omit(utilities.df[,-c("Company")]))
pcs13
summary(pcs13)
pcs13$rot
print("PC1 covers maximum proportion of variance i.e. 99%. The firs two componets accounts for 99% of the total variance. Hence using 2 components in the model may be suffiecient.")
```
## Principal Component Analysis after scaling numeric data
```{r PCA}
### PCA using Normalized variables
pcs.cor <- prcomp(na.omit(utilities.df[,-c("Company")]), scale. = T)
summary(pcs.cor)
pcs.cor$rot
print("The weights of the normalised components changed. For example Sales had the higest vakue before normalising and after normalising the value increaded to prositive in 1st component.")
install.packages("tinytex")
tinytex::install_tinytex()
```
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.