DataSystemsGroupUT
diff --git a/‎DESCRIPTION‎
Lines changed: 2 additions & 0 deletions b/‎DESCRIPTION‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 5 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎R/autoRLearn.R‎
Lines changed: 10 additions & 5 deletions b/‎R/autoRLearn.R‎
Lines changed: 10 additions & 5 deletions
diff --git a/‎R/convertCategorical.R‎
Lines changed: 1 addition & 2 deletions b/‎R/convertCategorical.R‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎R/evaluateMet.R‎
Lines changed: 6 additions & 4 deletions b/‎R/evaluateMet.R‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎R/featurePreProcessing.R‎
Lines changed: 1 addition & 1 deletion b/‎R/featurePreProcessing.R‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎R/intensify.R‎
Lines changed: 6 additions & 2 deletions b/‎R/intensify.R‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎R/readDataset.R‎
Lines changed: 76 additions & 25 deletions b/‎R/readDataset.R‎
Lines changed: 76 additions & 25 deletions
@@ -15,6 +15,8 @@ Imports:
   R.utils,
   stats,
   httr,
+  UBL,
+  imputeMissings,
   mice,
   RCurl,
   tictoc,
 
@@ -23,8 +23,11 @@ importFrom(RMySQL,dbSendQuery)
 importFrom(RMySQL,fetch)
 importFrom(RWeka,J48)
 importFrom(RWeka,LMT)
+importFrom(UBL,SmoteClassif)
+importFrom(caret,confusionMatrix)
 importFrom(caret,plsda)
 importFrom(deepboost,deepboost)
+importFrom(deepboost,deepboost.predict)
 importFrom(e1071,kurtosis)
 importFrom(e1071,naiveBayes)
 importFrom(e1071,skewness)
@@ -35,6 +38,8 @@ importFrom(httr,content)
 importFrom(iml,FeatureImp)
 importFrom(iml,Interaction)
 importFrom(iml,Predictor)
+importFrom(imputeMissings,compute)
+importFrom(imputeMissings,impute)
 importFrom(ipred,bagging)
 importFrom(klaR,rda)
 importFrom(mda,bruto)
 
@@ -6,9 +6,8 @@
 #' @param directory String Character of the training dataset directory (SmartML accepts file formats arff/(csv with columns headers) ).
 #' @param testDirectory String Character of the testing dataset directory (SmartML accepts file formats arff/(csv with columns headers) ).
 #' @param classCol String Character of the name of the class label column in the dataset (default = 'class').
-#' @param selectedFeats Vector of numeric of features columns to include from the training set and ignore the rest of columns - In case of empty vector, this means to include all features in the dataset file (default = c()).
 #' @param vRatio Float numeric of the validation set ratio that should be splitted out of the training set for the evaluation process (default = 0.1 --> 10\%).
-#' @param preProcessF String Character containing the name of the preprocessing algorithm (default = 'N' --> no preprocessing):
+#' @param preProcessF Vector of string Characters containing the names of the preprocessing algorithms, applied in the given order (default = c('standardize', 'zv')):
 #' \itemize{
 #' \item "boxcox" - apply a Box–Cox transform and values must be non-zero and positive in all features,
 #' \item "yeo-Johnson" - apply a Yeo-Johnson transform, like a BoxCox, but values can be negative,
@@ -26,7 +25,8 @@
 #' @param option Integer numeric representing either Classifier Algorithm Selection is needed only = 1 or Algorithm selection with its parameter tuning is required = 2 which is the default value.
 #' @param featureTypes Vector of either 'numerical' or 'categorical' representing the types of features in the dataset (default = c() --> any factor or character features will be considered as categorical otherwise numerical).
 #' @param interp Boolean representing if model interpretability (Feature Importance and Interaction) is needed or not (default = FALSE) This option will take more time budget if set to 1.
-#' @param missingOpr Boolean variable represents either delete instances with missing values or apply imputation using "MICE" library which helps you imputing missing values with plausible data values that are drawn from a distribution specifically designed for each missing datapoint- (default = FALSE to delete instances).
+#' @param missingOpr Boolean variable that selects how missing values are handled: FALSE applies median/mode imputation, while TRUE applies imputation using the "MICE" library, which imputes missing values with plausible data values that are drawn from a distribution specifically designed for each missing datapoint (default = FALSE).
+#' @param balance Boolean variable representing whether SMOTE class balancing is required (default = FALSE). Balancing is also applied when the metric is not 'acc' and the problem has exactly two classes.
 #' @param metric Metric of string character to be used in evaluation:
 #' \itemize{
 #' \item "acc" - Accuracy,
@@ -59,12 +59,14 @@
 #'
 #' @export autoRLearn
 
-autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', metric = 'acc', selectedFeats = c(), vRatio = 0.3, preProcessF = 'N', featuresToPreProcess = c(), nComp = NA, nModels = 5, option = 2, featureTypes = c(), interp = FALSE, missingOpr = FALSE) {
+autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', metric = 'acc', vRatio = 0.3, preProcessF = c('standardize', 'zv'), featuresToPreProcess = c(), nComp = NA, nModels = 5, option = 2, featureTypes = c(), interp = FALSE, missingOpr = FALSE, balance = FALSE) {
+  #Set Seed
+  set.seed(22)
   #Read Dataset
   datasetReadError <- try(
   {
     #Read Training Dataset
-    dataset <- readDataset(directory, testDirectory, selectedFeats = selectedFeats, classCol = classCol, vRatio = vRatio, preProcessF = preProcessF, featuresToPreProcess = featuresToPreProcess, nComp = nComp, missingOpr = missingOpr)
+    dataset <- readDataset(directory, testDirectory, classCol = classCol, vRatio = vRatio, preProcessF = preProcessF, featuresToPreProcess = featuresToPreProcess, nComp = nComp, missingOpr = missingOpr, metric = metric, balance = balance)
     trainingSet <- dataset$TD
     #Read Testing Dataset
     testDataset <- dataset$TED
@@ -159,6 +161,7 @@ autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', me
         classifierFailureCounter <- 0
 
         repeat{
+          gc()
           #Fit Model
           output <- fitModel(bestParams, bestPerf, trainingSet, validationSet, foldedSet, classifierAlgorithm, tree, B = B)
           #Check if this classifer failed for more than 5 times, skip to the next classifier
@@ -177,6 +180,7 @@ autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', me
             bestParams <- output$params
             bestPerf <- output$perf
             timeTillNow <- output$timeTillNow
+            classifierFailureCounter <- classifierFailureCounter + output$fails
             R <- output$r
           }
           #Check if execution time exceeded the allowed time or not
@@ -188,6 +192,7 @@ autoRLearn <- function(maxTime, directory, testDirectory, classCol = 'class', me
               bestAlgorithmPerf <- bestPerf
               bestAlgorithm <- classifierAlgorithm
               bestAlgorithmParams <- bestParams
+              cat('Best Classifier:', bestAlgorithm, ' --> Performance:', bestAlgorithmPerf, '\n')
             }
             break
           }
 
@@ -20,7 +20,7 @@
 
 convertCategorical <- function(dataset, trainDataset, testDataset, B = 10) {
   #Convert Factor/String Features into numeric features
-  dmy <- caret::dummyVars(" ~ .", data = trainDataset[,names(trainDataset) != "class"])
+  dmy <- caret::dummyVars(" ~ .", data = rbind(trainDataset, testDataset)[,names(trainDataset) != "class"])
   datasetTmp <- data.frame(predict(dmy, newdata = dataset$TD))
   dataset$FULLTD <- data.frame(predict(dmy, newdata = trainDataset))
   dataset$TED <- data.frame(predict(dmy, newdata = testDataset))
@@ -31,7 +31,6 @@ convertCategorical <- function(dataset, trainDataset, testDataset, B = 10) {
   dataset$TED$class <- testDataset$class
 
   if(nrow(dataset$VD) > 1){
-    #dmy <- dummyVars(" ~ .", data = dataset$VD[,names(dataset$VD) != "class"])
     validationSet <- data.frame(predict(dmy, newdata = dataset$VD))
     validationSet$class <- dataset$VD$class
     dataset$VD <- validationSet
 
@@ -15,6 +15,8 @@
 #' \item "precision" - Micro-Average of Precision of each label.
 #' }
 #'
+#' @importFrom  caret confusionMatrix
+#'
 #' @return Float number representing the evaluation.
 #'
 #' @examples
@@ -27,21 +29,21 @@
 #' @keywords internal
 #'
 evaluateMet <- function(yTrue, pred, metric = 'acc'){
-  cm = as.matrix(table(Actual = yTrue, Predicted = pred)) # create the confusion matrix
+  lvls <- union(pred, yTrue)
+  cm = as.matrix(table(Actual = factor(yTrue, lvls),
+                       Predicted = factor(pred, lvls)) ) # create the confusion matrix
   n = sum(cm) # number of instances
   nc = nrow(cm) # number of classes
   diag = diag(cm) # number of correctly classified instances per class
   rowsums = apply(cm, 1, sum) # number of instances per class
   colsums = apply(cm, 2, sum) # number of predictions per class
-
   oneVsAll = lapply(1 : nc,
                     function(i){
                       v = c(cm[i,i],
                             rowsums[i] - cm[i,i],
                             colsums[i] - cm[i,i],
                             n-rowsums[i] - colsums[i] + cm[i,i]);
                       return(matrix(v, nrow = 2, byrow = T))})
-
   s = matrix(0, nrow = 2, ncol = 2)
   for(i in 1 : nc){s = s + oneVsAll[[i]]}
 
@@ -64,7 +66,7 @@ evaluateMet <- function(yTrue, pred, metric = 'acc'){
   }
   else{
     perf <- (diag(s) / apply(s,1, sum))[1];
-
   }
+
   return(perf)
 }
@@ -40,7 +40,7 @@ featurePreProcessing <- function(data, dataTED, preProcessF, nComp) {
   }
   else if(preProcessF == 'pca'){
     if (is.na(nComp))
-      preprocessParams <- preProcess(data, method=c("center", "scale", "pca"))
+      preprocessParams <- preProcess(data, method=c("pca"))
     else
       preprocessParams <- preProcess(data, method=c("center", "scale", "pca"), pcaComp = nComp)
   }
 
@@ -44,9 +44,13 @@ intensify <- function(R, bestParams, bestPerf, candidateConfs, foldedSet, traini
     forMe <- 0
     #number of folds with lower performance for candidate configuration
     againstMe <- 0
+    fails <- 0
     while(pointer < B){
       for(i in pointer:min(pointer+N-1, B)){
         tmpPerf <- runClassifier(trainingSet[foldedSet[[i]], ], validationSet, cntParams, classifierAlgorithm, metric = metric)
+        if(tmpPerf$perf == 0){
+          fails <- fails + 1
+        }
         cntPerf <- c(cntPerf, tmpPerf$perf)
         if(i > length(bestPerf))
           tmpPerf <- runClassifier(trainingSet[foldedSet[[i]], ], validationSet, bestParams, classifierAlgorithm, metric = metric)
@@ -58,7 +62,7 @@ intensify <- function(R, bestParams, bestPerf, candidateConfs, foldedSet, traini
         t <- toc(quiet = TRUE)
         timeTillNow <- timeTillNow + t$toc - t$tic
         tic(quiet = TRUE)
-        if(timeTillNow > maxTime){
+        if(timeTillNow > maxTime || fails > 2){
           timeFlag <- TRUE
           break
         }
@@ -76,5 +80,5 @@ intensify <- function(R, bestParams, bestPerf, candidateConfs, foldedSet, traini
     bestParams$performance <- mean(bestPerf)
     R <- rbind(R, cntParams)
   }
-  return(list(params = bestParams, perf = bestPerf, r = R, timeTillNow = timeTillNow))
+  return(list(params = bestParams, perf = bestPerf, r = R, timeTillNow = timeTillNow, fails = fails))
 }
@@ -5,12 +5,22 @@
 #' @param directory String of the directory to the file containing the training dataset.
 #' @param testDirectory String of the directory to the file containing the testing dataset.
 #' @param vRatio The split ratio of the dataset file into training, and validation sets default(10% Validation - 90% Training).
-#' @param selectedFeats Vector of numbers of features to select from the dataset and ignore the rest of columns - empty vector means all features.
 #' @param classCol String of the class column of the dataset.
-#' @param preProcessF String of the preprocessing algorithm to apply.
+#' @param preProcessF Vector of Strings of the preprocessing algorithm to apply.
 #' @param featuresToPreProcess Vector of numbers of features columns to perform preprocessing - empty vector means all features.
 #' @param nComp Number of components needed if either "pca" or "ica" feature preprocessors are needed.
 #' @param missingOpr Boolean variable represents either applying median/mode imputation to missing values (FALSE) or applying imputation using the "MICE" library (TRUE).
+#' @param metric Metric of string character to be used in evaluation:
+#' \itemize{
+#' \item "acc" - Accuracy,
+#' \item "avg-fscore" - Average of F-Score of each label,
+#' \item "avg-recall" - Average of Recall of each label,
+#' \item "avg-precision" - Average of Precision of each label,
+#' \item "fscore" - Micro-Average of F-Score of each label,
+#' \item "recall" - Micro-Average of Recall of each label,
+#' \item "precision" - Micro-Average of Precision of each label.
+#' }
+#' @param balance Boolean variable representing whether SMOTE class balancing is required (default FALSE).
 #'
 #' @return List of the Training and Validation Sets splits.
 #'
@@ -20,35 +30,44 @@
 #' @import farff
 #' @import caret
 #' @import mice
+#' @importFrom UBL SmoteClassif
+#' @importFrom imputeMissings compute impute
 #' @importFrom  utils read.csv
 #' @importFrom stats complete.cases
 #'
 #' @noRd
 #'
 #' @keywords internal
 
-readDataset <- function(directory, testDirectory, vRatio = 0.3, selectedFeats, classCol, preProcessF, featuresToPreProcess, nComp, missingOpr) {
+readDataset <- function(directory, testDirectory, vRatio = 0.3, classCol, preProcessF, featuresToPreProcess, nComp, missingOpr, metric, balance) {
   #check if CSV or arff
   ext <- substr(directory, nchar(directory)-2, nchar(directory))
   #Read CSV file of data
   if(ext == 'csv'){
     con <- file(directory, "r")
-    data <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = FALSE)
+    data <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = TRUE)
     close(con)
     con <- file(testDirectory, "r")
-    dataTED <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = FALSE)
+    dataTED <- read.csv(file = con, header = TRUE, sep = ",", stringsAsFactors = TRUE)
     close(con)
   }
   else{
     data <- readARFF(directory)
     dataTED <- readARFF(testDirectory)
   }
 
+  #Sampling from large datasets
+  maxSample = 1000000
+  n = as.integer(maxSample / ncol(data))
+  if(maxSample < nrow(data) * ncol(data)){
+    sampleInds <- createDataPartition(y = data$class, times = 1, p = n/nrow(data), list = FALSE)
+    data <- data[sampleInds,]
+  }
+
   #change column name of classes to be "class"
   colnames(data)[which(names(data) == classCol)] <- "class"
   colnames(dataTED)[which(names(dataTED) == classCol)] <- "class"
   cInd <- grep("class", colnames(data)) #index of class column
-
   #function which returns function which will encode vectors with values of class column labels
   label_encoder <- function(vec){
     levels <- sort(unique(vec))
@@ -60,46 +79,78 @@ readDataset <- function(directory, testDirectory, vRatio = 0.3, selectedFeats, c
   data$class <- classEncoder(data$class) # encoding class labels of training set
   dataTED$class <- classEncoder(dataTED$class) # encoding class labels of testing set
 
-  #check either to delete instance with missing values or perform imputation
+  #handle missing values: median/mode imputation when missingOpr is FALSE, otherwise MICE imputation
   if (missingOpr == FALSE){
-    data <- data[complete.cases(data), ]
-    dataTED <- dataTED[complete.cases(dataTED), ]
+    missingVals <- imputeMissings::compute(data, method = "median/mode")
+    data <- impute(data, object = missingVals)
+    dataTED <- impute(dataTED, object = missingVals)
   }
   else{
     data <-complete( mice(data, m = 1, threshold = 1, printFlag = FALSE))
     dataTED <- complete(mice(dataTED, m = 1, threshold = 1, printFlag = FALSE))
   }
 
-  #select features only upon user request
-  if(length(selectedFeats) == 0){
-    selectedFeats <- c(1:ncol(data))
+  #remove ID features
+  numericFlag <- unlist(lapply(data, is.numeric))
+  rmvFlag = c()
+  for(i in 1:ncol(data)){
+    len = length(unique(data[,i]))
+    if(numericFlag[i] == FALSE && ((len / nrow(data) > 0.5) || len == 1) )
+      rmvFlag <- c(rmvFlag, i)
   }
-  #perform preprocessing
-  if(preProcessF != 'N'){
-    if(length(featuresToPreProcess ) == 0)
-      featuresToPreProcess <- selectedFeats
+  keepFlag = c(1:ncol(data))
+  keepFlag = keepFlag[!keepFlag %in% rmvFlag]
+  data <- data[, keepFlag]
+  dataTED <- dataTED[, keepFlag]
 
+  #Select all remaining features
+  selectedFeats <- c(1:ncol(data))
+
+  #perform preprocessing
+  if(length(featuresToPreProcess ) == 0){
+    numericFlag <- unlist(lapply(data, is.numeric))
+    for(i in 1:ncol(data)){
+      if(numericFlag[i] == TRUE && i != cInd)
+        featuresToPreProcess <- c(featuresToPreProcess, i)
+    }
+  }
+  if(length(preProcessF) != 0 && length(featuresToPreProcess) > 1){
     featuresToPreProcess <- featuresToPreProcess[!featuresToPreProcess %in% cInd] #remove class column from set of features to be preprocessed
-    dataTmp <- featurePreProcessing(data[,featuresToPreProcess], dataTED[,featuresToPreProcess], preProcessF, nComp)
+    dataTmp = list(TD = data[,featuresToPreProcess], TED = dataTED[,featuresToPreProcess])
+    #Add PCA if we have more than 100 features
+    if(length(featuresToPreProcess) > 100 && any('pca' != preProcessF) )
+      preProcessF <- c(preProcessF, 'pca')
+    for(i in 1:length(preProcessF)){
+      dataTmp <- featurePreProcessing(dataTmp$TD, dataTmp$TED, preProcessF[i], nComp)
+    }
 
     #add other features that don't require feature preprocessing to the features obtained after preprocessing
     diffTmp <- setdiff(selectedFeats, c(cInd, featuresToPreProcess))
-    dataTDTmp <- cbind(dataTmp$TD, data[, diffTmp])
-    dataTEDTmp <- cbind(dataTmp$TED, dataTED[, diffTmp])
+    dHead = c(colnames(dataTmp$TD), colnames(data)[diffTmp])
+
+    dataTDTmp <- data.frame(cbind(dataTmp$TD, data[,diffTmp]))
+    dataTEDTmp <- data.frame(cbind(dataTmp$TED, dataTED[,diffTmp]))
+    colnames(dataTDTmp) <- dHead
+    colnames(dataTEDTmp) <- dHead
+
     #add class column to the dataframe of the dataset
     dataTDTmp$class <- data$class
     dataTEDTmp$class <- dataTED$class
     data <- dataTDTmp
     dataTED <- dataTEDTmp
   }
-  else{
-    data <- data[, selectedFeats]
-    dataTED <- dataTED[, selectedFeats]
+
+  #Class Balancing using Smote for metrics other than accuracy and binary class problems
+  if( balance == TRUE || (metric != 'acc' && length(unique(data$class)) == 2) ){
+    data$class = factor(data$class)
+    data <- SmoteClassif(class ~., data, dist = 'HEOM')
   }
-  # Use 90% of the dataset as Training - 10% of the dataset as Validation by default
-  smp_size <- floor((1-vRatio) * nrow(data))
+
+  # Use 70% of the dataset as Training - 30% of the dataset as Validation by default
+  #smp_size <- floor((1-vRatio) * nrow(data))
   # set the seed to make your partition reproducible
-  train_ind <- sample(seq_len(nrow(data)), size = smp_size)
+  #train_ind <- sample(seq_len(nrow(data)), size = smp_size)
+  train_ind <- createDataPartition(y = data$class, times = 1, p = (1-vRatio), list = FALSE)
   trainingDataset <- data[train_ind, ]
   validationDataset <- data[-train_ind, ]
   return (list(TD = trainingDataset, VD = validationDataset, FULLTD = data, TED = dataTED))
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ featurePreProcessing <- function(data, dataTED, preProcessF, nComp) {`
`40`	`40`	`}`
`41`	`41`	`else if(preProcessF == 'pca'){`
`42`	`42`	`if (is.na(nComp))`
`43`		`- preprocessParams <- preProcess(data, method=c("center", "scale", "pca"))`
	`43`	`+ preprocessParams <- preProcess(data, method=c("pca"))`
`44`	`44`	`else`
`45`	`45`	`preprocessParams <- preProcess(data, method=c("center", "scale", "pca"), pcaComp = nComp)`
`46`	`46`	`}`