Skip to content

Commit 2111233

Browse files
committed
Adding Automatic Preprocessing (Imputation/sampling/Class balancing/PCA/standardization/zero variance removal/ids removal)
1 parent ffe7309 commit 2111233

File tree

13 files changed

+256
-191
lines changed

13 files changed

+256
-191
lines changed

DESCRIPTION

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ Imports:
1515
R.utils,
1616
stats,
1717
httr,
18+
UBL,
19+
imputeMissings,
1820
mice,
1921
RCurl,
2022
tictoc,

NAMESPACE

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,11 @@ importFrom(RMySQL,dbSendQuery)
2323
importFrom(RMySQL,fetch)
2424
importFrom(RWeka,J48)
2525
importFrom(RWeka,LMT)
26+
importFrom(UBL,SmoteClassif)
27+
importFrom(caret,confusionMatrix)
2628
importFrom(caret,plsda)
2729
importFrom(deepboost,deepboost)
30+
importFrom(deepboost,deepboost.predict)
2831
importFrom(e1071,kurtosis)
2932
importFrom(e1071,naiveBayes)
3033
importFrom(e1071,skewness)
@@ -35,6 +38,8 @@ importFrom(httr,content)
3538
importFrom(iml,FeatureImp)
3639
importFrom(iml,Interaction)
3740
importFrom(iml,Predictor)
41+
importFrom(imputeMissings,compute)
42+
importFrom(imputeMissings,impute)
3843
importFrom(ipred,bagging)
3944
importFrom(klaR,rda)
4045
importFrom(mda,bruto)

R/autoRLearn.R

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,8 @@
66
#' @param directory String Character of the training dataset directory (SmartML accepts file formats arff/(csv with columns headers) ).
77
#' @param testDirectory String Character of the testing dataset directory (SmartML accepts file formats arff/(csv with columns headers) ).
88
#' @param classCol String Character of the name of the class label column in the dataset (default = 'class').
9-
#' @param selectedFeats Vector of numeric of features columns to include from the training set and ignore the rest of columns - In case of empty vector, this means to include all features in the dataset file (default = c()).
109
#' @param vRatio Float numeric of the validation set ratio that should be split out of the training set for the evaluation process (default = 0.3 --> 30\%).
11-
#' @param preProcessF String Character containing the name of the preprocessing algorithm (default = 'N' --> no preprocessing):
10+
#' @param preProcessF Vector of string Characters containing the names of the preprocessing algorithms to apply (default = c('standardize', 'zv') --> standardization and zero-variance feature removal):
1211
#' \itemize{
1312
#' \item "boxcox" - apply a Box–Cox transform and values must be non-zero and positive in all features,
1413
#' \item "yeo-Johnson" - apply a Yeo-Johnson transform, like a BoxCox, but values can be negative,
@@ -26,7 +25,8 @@
2625
#' @param option Integer numeric representing either Classifier Algorithm Selection is needed only = 1 or Algorithm selection with its parameter tuning is required = 2 which is the default value.
2726
#' @param featureTypes Vector of either 'numerical' or 'categorical' representing the types of features in the dataset (default = c() --> any factor or character features will be considered as categorical otherwise numerical).
2827
#' @param interp Boolean representing if model interpretability (Feature Importance and Interaction) is needed or not (default = FALSE). This option will take more time budget if set to TRUE.
29-
#' @param missingOpr Boolean variable represents either delete instances with missing values or apply imputation using "MICE" library which helps you imputing missing values with plausible data values that are drawn from a distribution specifically designed for each missing datapoint- (default = FALSE to delete instances).
28+
#' @param missingOpr Boolean variable representing whether to impute missing values using median/mode imputation (FALSE) or using the "MICE" library, which imputes missing values with plausible data values that are drawn from a distribution specifically designed for each missing datapoint (TRUE) (default = FALSE).
29+
#' @param balance Boolean variable represents if SMOTE class balancing is required or not (default FALSE).
3030
#' @param metric Metric of string character to be used in evaluation:
3131
#' \itemize{
3232
#' \item "acc" - Accuracy,
@@ -59,12 +59,14 @@
5959
#'
6060
#' @export autoRLearn
6161

62-
autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', metric = 'acc', selectedFeats = c(), vRatio = 0.3, preProcessF = 'N', featuresToPreProcess = c(), nComp = NA, nModels = 5, option = 2, featureTypes = c(), interp = FALSE, missingOpr = FALSE) {
62+
autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', metric = 'acc', vRatio = 0.3, preProcessF = c('standardize', 'zv'), featuresToPreProcess = c(), nComp = NA, nModels = 5, option = 2, featureTypes = c(), interp = FALSE, missingOpr = FALSE, balance = FALSE) {
63+
#Set Seed
64+
set.seed(22)
6365
#Read Dataset
6466
datasetReadError <- try(
6567
{
6668
#Read Training Dataset
67-
dataset <- readDataset(directory, testDirectory, selectedFeats = selectedFeats, classCol = classCol, vRatio = vRatio, preProcessF = preProcessF, featuresToPreProcess = featuresToPreProcess, nComp = nComp, missingOpr = missingOpr)
69+
dataset <- readDataset(directory, testDirectory, classCol = classCol, vRatio = vRatio, preProcessF = preProcessF, featuresToPreProcess = featuresToPreProcess, nComp = nComp, missingOpr = missingOpr, metric = metric, balance = balance)
6870
trainingSet <- dataset$TD
6971
#Read Testing Dataset
7072
testDataset <- dataset$TED
@@ -159,6 +161,7 @@ autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', me
159161
classifierFailureCounter <- 0
160162

161163
repeat{
164+
gc()
162165
#Fit Model
163166
output <- fitModel(bestParams, bestPerf, trainingSet, validationSet, foldedSet, classifierAlgorithm, tree, B = B)
164167
#Check if this classifier failed more than 5 times, skip to the next classifier
@@ -177,6 +180,7 @@ autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', me
177180
bestParams <- output$params
178181
bestPerf <- output$perf
179182
timeTillNow <- output$timeTillNow
183+
classifierFailureCounter <- classifierFailureCounter + output$fails
180184
R <- output$r
181185
}
182186
#Check if execution time exceeded the allowed time or not
@@ -188,6 +192,7 @@ autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', me
188192
bestAlgorithmPerf <- bestPerf
189193
bestAlgorithm <- classifierAlgorithm
190194
bestAlgorithmParams <- bestParams
195+
cat('Best Classifier:', bestAlgorithm, ' --> Performance:', bestAlgorithmPerf, '\n')
191196
}
192197
break
193198
}

R/convertCategorical.R

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
convertCategorical <- function(dataset, trainDataset, testDataset, B = 10) {
2222
#Convert Factor/String Features into numeric features
23-
dmy <- caret::dummyVars(" ~ .", data = trainDataset[,names(trainDataset) != "class"])
23+
dmy <- caret::dummyVars(" ~ .", data = rbind(trainDataset, testDataset)[,names(trainDataset) != "class"])
2424
datasetTmp <- data.frame(predict(dmy, newdata = dataset$TD))
2525
dataset$FULLTD <- data.frame(predict(dmy, newdata = trainDataset))
2626
dataset$TED <- data.frame(predict(dmy, newdata = testDataset))
@@ -31,7 +31,6 @@ convertCategorical <- function(dataset, trainDataset, testDataset, B = 10) {
3131
dataset$TED$class <- testDataset$class
3232

3333
if(nrow(dataset$VD) > 1){
34-
#dmy <- dummyVars(" ~ .", data = dataset$VD[,names(dataset$VD) != "class"])
3534
validationSet <- data.frame(predict(dmy, newdata = dataset$VD))
3635
validationSet$class <- dataset$VD$class
3736
dataset$VD <- validationSet

R/evaluateMet.R

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#' \item "precision" - Micro-Average of Precision of each label.
1616
#' }
1717
#'
18+
#' @importFrom caret confusionMatrix
19+
#'
1820
#' @return Float number representing the evaluation.
1921
#'
2022
#' @examples
@@ -27,21 +29,21 @@
2729
#' @keywords internal
2830
#'
2931
evaluateMet <- function(yTrue, pred, metric = 'acc'){
30-
cm = as.matrix(table(Actual = yTrue, Predicted = pred)) # create the confusion matrix
32+
lvls <- union(pred, yTrue)
33+
cm = as.matrix(table(Actual = factor(yTrue, lvls),
34+
Predicted = factor(pred, lvls)) ) # create the confusion matrix
3135
n = sum(cm) # number of instances
3236
nc = nrow(cm) # number of classes
3337
diag = diag(cm) # number of correctly classified instances per class
3438
rowsums = apply(cm, 1, sum) # number of instances per class
3539
colsums = apply(cm, 2, sum) # number of predictions per class
36-
3740
oneVsAll = lapply(1 : nc,
3841
function(i){
3942
v = c(cm[i,i],
4043
rowsums[i] - cm[i,i],
4144
colsums[i] - cm[i,i],
4245
n-rowsums[i] - colsums[i] + cm[i,i]);
4346
return(matrix(v, nrow = 2, byrow = T))})
44-
4547
s = matrix(0, nrow = 2, ncol = 2)
4648
for(i in 1 : nc){s = s + oneVsAll[[i]]}
4749

@@ -64,7 +66,7 @@ evaluateMet <- function(yTrue, pred, metric = 'acc'){
6466
}
6567
else{
6668
perf <- (diag(s) / apply(s,1, sum))[1];
67-
6869
}
70+
6971
return(perf)
7072
}

R/featurePreProcessing.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ featurePreProcessing <- function(data, dataTED, preProcessF, nComp) {
4040
}
4141
else if(preProcessF == 'pca'){
4242
if (is.na(nComp))
43-
preprocessParams <- preProcess(data, method=c("center", "scale", "pca"))
43+
preprocessParams <- preProcess(data, method=c("pca"))
4444
else
4545
preprocessParams <- preProcess(data, method=c("center", "scale", "pca"), pcaComp = nComp)
4646
}

R/intensify.R

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,9 +44,13 @@ intensify <- function(R, bestParams, bestPerf, candidateConfs, foldedSet, traini
4444
forMe <- 0
4545
#number of folds with lower performance for candidate configuration
4646
againstMe <- 0
47+
fails <- 0
4748
while(pointer < B){
4849
for(i in pointer:min(pointer+N-1, B)){
4950
tmpPerf <- runClassifier(trainingSet[foldedSet[[i]], ], validationSet, cntParams, classifierAlgorithm, metric = metric)
51+
if(tmpPerf$perf == 0){
52+
fails <- fails + 1
53+
}
5054
cntPerf <- c(cntPerf, tmpPerf$perf)
5155
if(i > length(bestPerf))
5256
tmpPerf <- runClassifier(trainingSet[foldedSet[[i]], ], validationSet, bestParams, classifierAlgorithm, metric = metric)
@@ -58,7 +62,7 @@ intensify <- function(R, bestParams, bestPerf, candidateConfs, foldedSet, traini
5862
t <- toc(quiet = TRUE)
5963
timeTillNow <- timeTillNow + t$toc - t$tic
6064
tic(quiet = TRUE)
61-
if(timeTillNow > maxTime){
65+
if(timeTillNow > maxTime || fails > 2){
6266
timeFlag <- TRUE
6367
break
6468
}
@@ -76,5 +80,5 @@ intensify <- function(R, bestParams, bestPerf, candidateConfs, foldedSet, traini
7680
bestParams$performance <- mean(bestPerf)
7781
R <- rbind(R, cntParams)
7882
}
79-
return(list(params = bestParams, perf = bestPerf, r = R, timeTillNow = timeTillNow))
83+
return(list(params = bestParams, perf = bestPerf, r = R, timeTillNow = timeTillNow, fails = fails))
8084
}

R/readDataset.R

Lines changed: 76 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,22 @@
55
#' @param directory String of the directory to the file containing the training dataset.
66
#' @param testDirectory String of the directory to the file containing the testing dataset.
77
#' @param vRatio The split ratio of the dataset file into training and validation sets (default = 0.3: 30% Validation - 70% Training).
8-
#' @param selectedFeats Vector of numbers of features to select from the dataset and ignore the rest of columns - empty vector means all features.
98
#' @param classCol String of the class column of the dataset.
10-
#' @param preProcessF String of the preprocessing algorithm to apply.
9+
#' @param preProcessF Vector of Strings of the preprocessing algorithm to apply.
1110
#' @param featuresToPreProcess Vector of numbers of features columns to perform preprocessing - empty vector means all features.
1211
#' @param nComp Number of components needed if either "pca" or "ica" feature preprocessors are needed.
1312
#' @param missingOpr Boolean variable representing whether to impute missing values using median/mode imputation (FALSE) or using the "MICE" library (TRUE).
13+
#' @param balance Boolean variable represents if SMOTE class balancing is required or not (default FALSE).
14+
#' @param metric Metric of string character to be used in evaluation:
15+
#' \itemize{
16+
#' \item "acc" - Accuracy,
17+
#' \item "avg-fscore" - Average of F-Score of each label,
18+
#' \item "avg-recall" - Average of Recall of each label,
19+
#' \item "avg-precision" - Average of Precision of each label,
20+
#' \item "fscore" - Micro-Average of F-Score of each label,
21+
#' \item "recall" - Micro-Average of Recall of each label,
22+
#' \item "precision" - Micro-Average of Precision of each label.
23+
#' }
1424
#'
1525
#' @return List of the Training and Validation Sets splits.
1626
#'
@@ -20,35 +30,44 @@
2030
#' @import farff
2131
#' @import caret
2232
#' @import mice
33+
#' @importFrom UBL SmoteClassif
34+
#' @importFrom imputeMissings compute impute
2335
#' @importFrom utils read.csv
2436
#' @importFrom stats complete.cases
2537
#'
2638
#' @noRd
2739
#'
2840
#' @keywords internal
2941

30-
readDataset <- function(directory, testDirectory, vRatio = 0.3, selectedFeats, classCol, preProcessF, featuresToPreProcess, nComp, missingOpr) {
42+
readDataset <- function(directory, testDirectory, vRatio = 0.3, classCol, preProcessF, featuresToPreProcess, nComp, missingOpr, metric, balance) {
3143
#check if CSV or arff
3244
ext <- substr(directory, nchar(directory)-2, nchar(directory))
3345
#Read CSV file of data
3446
if(ext == 'csv'){
3547
con <- file(directory, "r")
36-
data <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = FALSE)
48+
data <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = TRUE)
3749
close(con)
3850
con <- file(testDirectory, "r")
39-
dataTED <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = FALSE)
51+
dataTED <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = TRUE)
4052
close(con)
4153
}
4254
else{
4355
data <- readARFF(directory)
4456
dataTED <- readARFF(testDirectory)
4557
}
4658

59+
#Sampling from large datasets
60+
maxSample = 1000000
61+
n = as.integer(maxSample / ncol(data))
62+
if(maxSample < nrow(data) * ncol(data)){
63+
sampleInds <- createDataPartition(y = data$class, times = 1, p = n/nrow(data), list = FALSE)
64+
data <- data[sampleInds,]
65+
}
66+
4767
#change column name of classes to be "class"
4868
colnames(data)[which(names(data) == classCol)] <- "class"
4969
colnames(dataTED)[which(names(dataTED) == classCol)] <- "class"
5070
cInd <- grep("class", colnames(data)) #index of class column
51-
5271
#function which returns function which will encode vectors with values of class column labels
5372
label_encoder <- function(vec){
5473
levels <- sort(unique(vec))
@@ -60,46 +79,78 @@ readDataset <- function(directory, testDirectory, vRatio = 0.3, selectedFeats, c
6079
data$class <- classEncoder(data$class) # encoding class labels of training set
6180
dataTED$class <- classEncoder(dataTED$class) # encoding class labels of testing set
6281

63-
#check either to delete instance with missing values or perform imputation
82+
#impute missing values using median/mode imputation (missingOpr = FALSE) or MICE (missingOpr = TRUE)
6483
if (missingOpr == FALSE){
65-
data <- data[complete.cases(data), ]
66-
dataTED <- dataTED[complete.cases(dataTED), ]
84+
missingVals <- imputeMissings::compute(data, method = "median/mode")
85+
data <- impute(data, object = missingVals)
86+
dataTED <- impute(dataTED, object = missingVals)
6787
}
6888
else{
6989
data <-complete( mice(data, m = 1, threshold = 1, printFlag = FALSE))
7090
dataTED <- complete(mice(dataTED, m = 1, threshold = 1, printFlag = FALSE))
7191
}
7292

73-
#select features only upon user request
74-
if(length(selectedFeats) == 0){
75-
selectedFeats <- c(1:ncol(data))
93+
#remove ID features
94+
numericFlag <- unlist(lapply(data, is.numeric))
95+
rmvFlag = c()
96+
for(i in 1:ncol(data)){
97+
len = length(unique(data[,i]))
98+
if(numericFlag[i] == FALSE && ((len / nrow(data) > 0.5) || len == 1) )
99+
rmvFlag <- c(rmvFlag, i)
76100
}
77-
#perform preprocessing
78-
if(preProcessF != 'N'){
79-
if(length(featuresToPreProcess ) == 0)
80-
featuresToPreProcess <- selectedFeats
101+
keepFlag = c(1:ncol(data))
102+
keepFlag = keepFlag[!keepFlag %in% rmvFlag]
103+
data <- data[, keepFlag]
104+
dataTED <- dataTED[, keepFlag]
81105

106+
#Select all remaining features
107+
selectedFeats <- c(1:ncol(data))
108+
109+
#perform preprocessing
110+
if(length(featuresToPreProcess ) == 0){
111+
numericFlag <- unlist(lapply(data, is.numeric))
112+
for(i in 1:ncol(data)){
113+
if(numericFlag[i] == TRUE && i != cInd)
114+
featuresToPreProcess <- c(featuresToPreProcess, i)
115+
}
116+
}
117+
if(length(preProcessF) != 0 && length(featuresToPreProcess) > 1){
82118
featuresToPreProcess <- featuresToPreProcess[!featuresToPreProcess %in% cInd] #remove class column from set of features to be preprocessed
83-
dataTmp <- featurePreProcessing(data[,featuresToPreProcess], dataTED[,featuresToPreProcess], preProcessF, nComp)
119+
dataTmp = list(TD = data[,featuresToPreProcess], TED = dataTED[,featuresToPreProcess])
120+
#Add PCA if we have more than 100 features
121+
if(length(featuresToPreProcess) > 100 && any('pca' != preProcessF) )
122+
preProcessF <- c(preProcessF, 'pca')
123+
for(i in 1:length(preProcessF)){
124+
dataTmp <- featurePreProcessing(dataTmp$TD, dataTmp$TED, preProcessF[i], nComp)
125+
}
84126

85127
#add other features that don't require feature preprocessing to the features obtained after preprocessing
86128
diffTmp <- setdiff(selectedFeats, c(cInd, featuresToPreProcess))
87-
dataTDTmp <- cbind(dataTmp$TD, data[, diffTmp])
88-
dataTEDTmp <- cbind(dataTmp$TED, dataTED[, diffTmp])
129+
dHead = c(colnames(dataTmp$TD), colnames(data)[diffTmp])
130+
131+
dataTDTmp <- data.frame(cbind(dataTmp$TD, data[,diffTmp]))
132+
dataTEDTmp <- data.frame(cbind(dataTmp$TED, dataTED[,diffTmp]))
133+
colnames(dataTDTmp) <- dHead
134+
colnames(dataTEDTmp) <- dHead
135+
89136
#add class column to the dataframe of the dataset
90137
dataTDTmp$class <- data$class
91138
dataTEDTmp$class <- dataTED$class
92139
data <- dataTDTmp
93140
dataTED <- dataTEDTmp
94141
}
95-
else{
96-
data <- data[, selectedFeats]
97-
dataTED <- dataTED[, selectedFeats]
142+
143+
#Class Balancing using Smote for metrics other than accuracy and binary class problems
144+
if( balance == TRUE || (metric != 'acc' && length(unique(data$class)) == 2) ){
145+
data$class = factor(data$class)
146+
data <- SmoteClassif(class ~., data, dist = 'HEOM')
98147
}
99-
# Use 90% of the dataset as Training - 10% of the dataset as Validation by default
100-
smp_size <- floor((1-vRatio) * nrow(data))
148+
149+
# Use 70% of the dataset as Training - 30% of the dataset as Validation by default
150+
#smp_size <- floor((1-vRatio) * nrow(data))
101151
# set the seed to make your partition reproducible
102-
train_ind <- sample(seq_len(nrow(data)), size = smp_size)
152+
#train_ind <- sample(seq_len(nrow(data)), size = smp_size)
153+
train_ind <- createDataPartition(y = data$class, times = 1, p = (1-vRatio), list = FALSE)
103154
trainingDataset <- data[train_ind, ]
104155
validationDataset <- data[-train_ind, ]
105156
return (list(TD = trainingDataset, VD = validationDataset, FULLTD = data, TED = dataTED))

0 commit comments

Comments
 (0)