diff --git a/DESCRIPTION b/DESCRIPTION index 589b2dc..69e7b57 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,5 +46,5 @@ Config/testthat/edition: 3 Encoding: UTF-8 NeedsCompilation: yes Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3.9000 +RoxygenNote: 7.3.1 Config/Needs/website: rmarkdown diff --git a/R/OMLTask.R b/R/OMLTask.R index 4bdd2b6..217a98b 100644 --- a/R/OMLTask.R +++ b/R/OMLTask.R @@ -10,6 +10,8 @@ #' #' @section mlr3 Integration: #' * Obtain a [mlr3::Task] by calling `as_task()`. +#' Note that this only works for classification and regression tasks. +#' For survival tasks, see the *Getting Started* vignette on the package website. #' * Obtain a [mlr3::Resampling] by calling `as_resampling()`. #' #' @references @@ -135,18 +137,21 @@ OMLTask = R6Class("OMLTask", }, #' @field target_names (`character()`)\cr #' Name of the targets, as extracted from the OpenML task description. + #' For survival tasks, a vector with names `"event"` and at least one of `"left"` and `"right"` is returned. target_names = function() { source_data = self$desc$input$source_data + conv_surv = function(source_data) { + target_names = unlist( + source_data[c("target_feature_left", "target_feature_right", "target_feature_event")] + ) + set_names(make.names(target_names), gsub(pattern = "^target_feature_", "", names(target_names))) + } targets = switch(self$desc$task_type, "Supervised Classification" = , - "Supervised Regression" = source_data$target_feature, - # "Survival Analysis" = unlist( - # source_data[c("target_feature_left", "target_feature_right", "target_feature_event")], - # use.names = FALSE - # ), + "Supervised Regression" = make.names(source_data$target_feature), + "Survival Analysis" = conv_surv(source_data), stopf("Unsupported task type '%s'", self$desc$task_type) ) - make.names(targets) }, #' @field feature_names (`character()`)\cr #' Name of the features (without targets of this [OMLTask]). @@ -189,6 +194,8 @@ as_task.OMLTask = function(x, ...) 
{ "Supervised Classification" = TaskClassif, "Supervised Regression" = TaskRegr, # "Survival Analysis" = new_task_surv, + "Survival Analysis" = + stopf("For survival tasks, see the 'Getting Started' vignette on the website."), stopf("Unsupported task type '%s'.", x$desc$task_type) ) task = constructor$new(name, backend, target = target) diff --git a/R/list_oml_tasks.R b/R/list_oml_tasks.R index 809a227..4220441 100644 --- a/R/list_oml_tasks.R +++ b/R/list_oml_tasks.R @@ -2,7 +2,7 @@ #' @param task_id (`integer()`)\cr #' Vector of task ids to restrict to. #' @param type (`character(1)`)\cr -#' The task type, supported values are: `"clasisf"`, `"regr"`, `"surv"` and `"clust"`. +#' The task type, supported values are: `"classif"`, `"regr"`, `"surv"` and `"clust"`. #' @export list_oml_tasks = function(task_id = NULL, data_id = NULL, number_instances = NULL, number_features = NULL, number_classes = NULL, number_missing_values = NULL, tag = NULL, limit = limit_default(), diff --git a/R/publish_task.R b/R/publish_task.R index f163d51..81176a2 100644 --- a/R/publish_task.R +++ b/R/publish_task.R @@ -10,8 +10,10 @@ #' Can either be `"classif"` or `"regr"` or an integer indicating the task type. #' @param estimation_procedure (`integer(1)`)\cr #' The id of the estimation procedure. -#' @param target (`character(1)`)\cr +#' @param target (`character(1)` | named `character()`)\cr #' The target variable (if applicable). +#' For survival tasks, this must be a named vector, containing at least `"event"`, as well as +#' either `"left"` (timestamp), `"right"` (timestamp) or both. 
#' @template param_api_key #' @template param_test_server #' @@ -30,12 +32,19 @@ publish_task = function(id, type, estimation_procedure, target, api_key = NULL, type = switch(type, regr = 2, classif = 1, + surv = 7, stopf("Invalid type '%s'.", type) ) } else { assert_int(type, lower = 1L) } - assert_character(target, len = 1L) + if (type == 7) { + assert_character(target, min.len = 2L, any.missing = FALSE) + tn = names(target) + assert_true(("event" %in% tn) & ("left" %in% tn || "right" %in% tn)) + } else { + assert_character(target, len = 1L) + } estimation_procedure = assert_int(estimation_procedure) add = function(name, value) { @@ -48,7 +57,17 @@ publish_task = function(id, type, estimation_procedure, target, api_key = NULL, task = xml2::xml_add_child(doc, "oml:task_inputs", "xmlns:oml" = "http://openml.org/openml") xml2::xml_add_child(task, "oml:task_type_id", type) add("source_data", id) - if (!is.null(target)) add("target_feature", target) + if (type == 7) { + add("target_feature_event", target["event"]) + if ("left" %in% names(target)) { + add("target_feature_left", target["left"]) + } + if ("right" %in% names(target)) { + add("target_feature_right", target["right"]) + } + } else { + if (!is.null(target)) add("target_feature", target) + } add("estimation_procedure", estimation_procedure) withr::defer(unlink(desc_path)) diff --git a/man/list_oml.Rd b/man/list_oml.Rd index cd2d45d..d2727fa 100644 --- a/man/list_oml.Rd +++ b/man/list_oml.Rd @@ -131,7 +131,7 @@ Filter for flow id.} Vector of setup ids to restrict to.} \item{type}{(\code{character(1)})\cr -The task type, supported values are: \code{"clasisf"}, \code{"regr"}, \code{"surv"} and \code{"clust"}.} +The task type, supported values are: \code{"classif"}, \code{"regr"}, \code{"surv"} and \code{"clust"}.} } \value{ (\code{data.table()}) of results, or a null data.table if no data set matches the filter criteria. 
diff --git a/man/oml_task.Rd b/man/oml_task.Rd index c6481b8..5cd81f6 100644 --- a/man/oml_task.Rd +++ b/man/oml_task.Rd @@ -14,6 +14,8 @@ This object can also be constructed using the sugar function \code{\link[=otsk]{ \itemize{ \item Obtain a \link[mlr3:Task]{mlr3::Task} by calling \code{as_task()}. +Note that this only works for classification and regression tasks. +For survival tasks, see the \emph{Getting Started} vignette on the package website. \item Obtain a \link[mlr3:Resampling]{mlr3::Resampling} by calling \code{as_resampling()}. } } @@ -71,7 +73,8 @@ Number of rows, extracted from the \link{OMLData} object.} Number of columns, as extracted from the \link{OMLData} object.} \item{\code{target_names}}{(\code{character()})\cr -Name of the targets, as extracted from the OpenML task description.} +Name of the targets, as extracted from the OpenML task description. +For survival tasks, a vector with names \code{"event"} and at least one of \code{"left"} and \code{"right"} is returned.} \item{\code{feature_names}}{(\code{character()})\cr Name of the features (without targets of this \link{OMLTask}).} diff --git a/man/publish_task.Rd b/man/publish_task.Rd index 7c88079..282f45f 100644 --- a/man/publish_task.Rd +++ b/man/publish_task.Rd @@ -23,8 +23,10 @@ Can either be \code{"classif"} or \code{"regr"} or an integer indicating the tas \item{estimation_procedure}{(\code{integer(1)})\cr The id of the estimation procedure.} -\item{target}{(\code{character(1)})\cr -The target variable (if applicable).} +\item{target}{(\code{character(1)} | named \code{character()})\cr +The target variable (if applicable). 
+For survival tasks, this must be a named vector, containing at least \code{"event"}, as well as +either \code{"left"} (timestamp), \code{"right"} (timestamp) or both.} \item{api_key}{(\code{character(1)}) The API key to perform the action, if left NULL it first tries the "mlr3oml.api_key" R option and diff --git a/tests/testthat/test_OMLRun.R b/tests/testthat/test_OMLRun.R index 6e243fc..69ac6bb 100644 --- a/tests/testthat/test_OMLRun.R +++ b/tests/testthat/test_OMLRun.R @@ -5,7 +5,7 @@ skip_on_cran() # expect_oml_run works # # When the test server works: -# 1. Create run for clasisf, regr and surv (for different predict_types) +# 1. Create run for classif, regr and surv (for different predict_types) # 2. Publish the run. # 3. Download the run and convert it. # diff --git a/tests/testthat/test_publish_task.R b/tests/testthat/test_publish_task.R index 4ae78ee..b2f1488 100644 --- a/tests/testthat/test_publish_task.R +++ b/tests/testthat/test_publish_task.R @@ -12,7 +12,7 @@ test_that("Can publish task on test server", { } expect_message({task_id <<- f()}, regexp = NA) # nolint - Sys.sleep(5) + Sys.sleep(10) expect_message(task_id2 <<- f(), "already exists") expect_equal(task_id, task_id2) @@ -24,3 +24,19 @@ test_that("Can publish task on test server", { expect_equal(otask$target_names, "Species") expect_equal(otask$task_type, "Supervised Classification") }) + +test_that("survival", { + test_server = TRUE + withr::defer(delete(type = "data", id = data_id, test_server = test_server)) + withr::defer(delete(type = "task", id = task_id, test_server = test_server)) + data = data.frame(status = sample(c(1, 2), size = 100, replace = TRUE), time = runif(100), x = rnorm(100)) + data_id = publish_data(data, name = "test_surv", desc = "test rats", test_server = test_server) + expect_integer(data_id) + Sys.sleep(10) + task_id = publish_task(id = data_id, type = "surv", estimation_procedure = 19, + target = c(event = "status", right = "time"), test_server = test_server) + + 
Sys.sleep(5) + otask = otsk(task_id, test_server = test_server) + expect_equal(otask$task_type, "Survival Analysis") +}) diff --git a/vignettes/articles/tutorial.Rmd b/vignettes/articles/tutorial.Rmd index a25721d..3409c1e 100644 --- a/vignettes/articles/tutorial.Rmd +++ b/vignettes/articles/tutorial.Rmd @@ -161,6 +161,40 @@ tsk("oml", task_id = 261) rsmp("oml", task_id = 261) ``` +For technical reasons, it is currently not possible to directly convert OpenML survival tasks to {mlr3proba} survival tasks. +Below, we create an example survival task: + +```{r} +otask_surv = otsk(168753) +otask_surv +otask_surv$target_names +``` + +Here, `$target_names` always includes a value with name `"event"`, and at least one of `"left"` (timestamp) and `"right"` (timestamp). +In order to manually create an `mlr3proba::TaskSurv`, we use `mlr3proba::as_task_surv` and pass the data, task id and other metadata to the function. + +If: +* only the `"left"` timestamp is present, pass it as argument `time` and set the type to `"left"` +* only the `"right"` timestamp is present, pass it as argument `time` and set the type to `"right"` +* both `"left"` and `"right"` are present, pass the `"left"` timestamp as `time`, the `"right"` timestamp as `time2` and set `type` to `"interval"`. + +```{r, eval = FALSE} +mlr3_task = mlr3proba::as_task_surv(otask_surv$data$data, + id = otask_surv$id, + event = otask_surv$target_names["event"], + time = otask_surv$target_names["left"], + type = "left" +) +mlr3_task$col_roles$feature = otask_surv$feature_names +mlr3_task +``` +```{r, eval = FALSE} +## (432 x 9) +## * Target: week, arrest +## * Properties: - +## * Features (7): +## - int (7): age, fin, mar, paro, prio, race, wexp +``` ### Flows and Runs