Feat/new api #134

Draft · wants to merge 2 commits into base: main
4 changes: 2 additions & 2 deletions R/OMLTask.R
@@ -100,7 +100,7 @@ OMLTask = R6Class("OMLTask",
#' @field name (`character(1)`)\cr
#' Name of the task, extracted from the task description.
name = function() {
self$desc$task_name
self$desc$name
},
#' @field task_type (`character(1)`)\cr
#' The OpenML task type.
@@ -156,7 +156,7 @@ OMLTask = R6Class("OMLTask",
#' @field data_name (`character()`)\cr
#' Name of the dataset (inferred from the task name).
data_name = function() {
strsplit(self$desc$task_name, split = " ")[[1]][[3]]
strsplit(self$desc$name, split = " ")[[1]][[3]]
}
),
private = list(
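
For reference, the data_name accessor above takes the third whitespace-separated token of the task name. This relies on OpenML task names following the usual "Task <id>: <dataset> (...)" pattern, which is an assumption about the naming convention rather than something enforced in this diff. A minimal illustration with a made-up task name:

    # Hypothetical task name; the exact format is an assumption, not part of the diff.
    name = "Task 59: iris (Supervised Classification)"
    strsplit(name, split = " ")[[1]][[3]]
    # [1] "iris"
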
2 changes: 1 addition & 1 deletion R/download_data_features.R
@@ -1,5 +1,5 @@
download_data_features = function(data_id, server, desc = download_desc_data(data_id, server)) {
features = get_json(paste0(server, "/json/data/features/%i"), data_id, server = server)[[1L]][[1L]]
features = get_json(paste0(server, "/datasets/features/%i"), data_id, server = server)

features$index = as.integer(features$index)
features$name = make.names(features$name)
2 changes: 1 addition & 1 deletion R/download_data_qualities.R
@@ -1,5 +1,5 @@
download_data_qualities = function(data_id, server) {
qualities = get_json(paste0(server, "/json/data/qualities/%i"), data_id, server = server)[[1L]][[1L]] # nolint
qualities = get_json(paste0(server, "/datasets/qualities/%i"), data_id, server = server)

qualities$value = as.numeric(qualities$value)
setDT(qualities, key = "name")[]
4 changes: 1 addition & 3 deletions R/download_desc_collection.R
@@ -1,7 +1,5 @@
download_desc_collection = function(collection_id, server) {
desc = get_json(paste0(server, "/json/study/%i"), collection_id,
simplify_data_frame = FALSE, server = server
)[[1L]]
desc = get_json(paste0(server, "/studies/%i"), collection_id, simplify_data_frame = FALSE, server = server)

parse_desc_collection(desc)
}
26 changes: 19 additions & 7 deletions R/download_desc_data.R
@@ -1,6 +1,5 @@
download_desc_data = function(data_id, server) {
desc = get_json(paste0(server, "/json/data/%i"), data_id, simplify_data_frame = FALSE,
server = server)[[1L]]
desc = get_json(paste0(server, "/datasets/%i"), data_id, simplify_data_frame = FALSE, server = server)
parse_desc_data(desc)
}

@@ -10,16 +9,29 @@ parse_desc_data = function(desc) {
stopf("Unsupported data format: %s", desc$format)
}

desc$id = as.integer(desc$id)
desc$version = as.integer(desc$version)
# default_target_attribute, row_id_attribute and ignore_attribute have return type array<string> in the json
# schema. In the empty case, jsonlite returns these as a `list()` so we have to convert them to characters

# When make.names() does not return unique names for a dataset's features, an error is thrown
# elsewhere in the code, so we can change the name of the target column here without risk
desc$default_target_attribute = make.names(desc$default_target_attribute)
desc$upload_date = strptime(desc$upload_date, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")
desc$processing_date = strptime(desc$processing_date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
desc$row_id_attribute = as.character(desc$row_id_attribute)
# OpenML (sometimes) uploaded the ignore_attributes comma-separated
ignore_attribute = map(desc$ignore_attribute, function(x) strsplit(x, ",")[[1L]])
desc$ignore_attribute = make.unique(make.names(unlist(ignore_attribute)))
desc$ignore_attribute = make.names(as.character(ignore_attribute))

if (anyDuplicated(desc$ignore_attribute)) {
stopf("No unique names after conversion. This happened because ignore attribute names are not valid R names and had to be converted, which created duplicates.") # nolint
}

desc$upload_date = strptime(desc$upload_date, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")
desc$processing_date = strptime(desc$processing_date, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")

desc$creator = as.character(desc$creator)
desc$contributor = as.character(desc$contributor)
desc$paper_url = as.character(desc$paper_url)
desc$tag = as.character(desc$tag)

return(desc)
}

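To illustrate the two behaviours the new comments in parse_desc_data() describe: jsonlite parses an empty JSON array as list(), which as.character() turns into character(0), and make.names() can map distinct raw names onto the same valid R name, which the anyDuplicated() guard then catches. A small sketch with made-up values:

    library(jsonlite)

    # Empty array<string> fields come back as list(); coercion yields character(0).
    desc = fromJSON('{"ignore_attribute": [], "creator": ["A. Uthor"]}')
    as.character(desc$ignore_attribute)  # character(0)
    as.character(desc$creator)           # "A. Uthor"

    # make.names() can collapse distinct names into duplicates, hence the guard.
    converted = make.names(c("my attr", "my.attr"))  # both become "my.attr"
    anyDuplicated(converted) > 0                     # TRUE -> stopf() in parse_desc_data()
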
10 changes: 3 additions & 7 deletions R/download_desc_flow.R
@@ -1,18 +1,14 @@
download_desc_flow = function(flow_id, server) {
desc = get_json(paste0(server, "/json/flow/%i"), flow_id, server = server)[[1L]]
desc = get_json(paste0(server, "/flows/%i"), flow_id, server = server)
desc = parse_desc_flow(desc)
return(desc)
}

parse_desc_flow = function(desc) {
desc$id = as.integer(desc$id)
desc$upload_date = as.POSIXct(desc$upload_date, format = "%Y-%m-%dT%H:%M:%S", tz = "UTC")
desc$version = as.integer(desc$version)
desc$uploader = as.integer(desc$uploader)
desc$id = as.integer(desc$id)
desc$parameter =

if (is.null(desc$parameter)) {
browser()
if (!length(desc$parameter)) {
desc$parameter = data.table(
name = character(0),
data_type = character(0),
2 changes: 1 addition & 1 deletion R/download_desc_run.R
@@ -1,5 +1,5 @@
download_desc_run = function(run_id, server) {
desc = get_json(paste0(server, "/json/run/%i"), run_id, server = server)[[1L]]
desc = get_json(paste0(server, "/runs/%i"), run_id, server = server)
desc = parse_desc_run(desc)
return(desc)
}
11 changes: 5 additions & 6 deletions R/download_desc_task.R
@@ -1,17 +1,16 @@
download_desc_task = function(task_id, server) {
desc = get_json(paste0(server, "/json/task/%i"), task_id,
desc = get_json(paste0(server, "/tasks/%i"), task_id,
simplify_data_frame = FALSE, server = server
)[[1L]]
)

desc = parse_desc_task(desc)
return(desc)
}

parse_desc_task = function(desc) {
desc$task_id = as.integer(desc$task_id)
desc$task_type_id = as.integer(desc$task_type_id)
desc$input = set_names(map(desc$input, function(x) x[[2L]]), map_chr(desc$input, "name"))
desc$input$source_data$data_set_id = as.integer(desc$input$source_data$data_set_id)
desc$input = set_names(imap(desc$input, function(x, nm) x[[1]]), map_chr(desc$input, "name"))
desc$output = set_names(imap(desc$output, function(x, nm) x[[1]]), map_chr(desc$output, "name"))

est_params = desc$input$estimation_procedure$parameter
ep_names = map(est_params, "name")
ep_values = map(est_params, "value")
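
The reworked parsing above keeps the first element of each input/output entry and keys the result by its "name" field. A rough base-R sketch of the same idea (the diff uses the mlr3misc equivalents imap(), map_chr() and set_names(); the shape of desc$input below is a made-up assumption about the new /tasks endpoint):

    # Hypothetical shape of desc$input under the new API.
    input = list(
      list(source_data = list(data_set_id = 61L), name = "source_data"),
      list(estimation_procedure = list(id = 1L), name = "estimation_procedure")
    )

    setNames(
      lapply(input, function(x) x[[1]]),
      vapply(input, function(x) x$name, character(1))
    )
    # $source_data$data_set_id is 61, $estimation_procedure$id is 1
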
26 changes: 19 additions & 7 deletions R/helper.R
@@ -101,17 +101,20 @@ get_json = function(url, ..., simplify_vector = TRUE, simplify_data_frame = TRUE
for (retry in seq_len(retries)) {
response = download_file(url, path, api_key = api_key)

browser()
if (response$ok) {
json = jsonlite::fromJSON(path, simplifyVector = simplify_vector, simplifyDataFrame = simplify_data_frame)
return(json)
} else if (retry < retries) {
if (response$oml_code %in% c(107L)) {
delay = max(rnorm(1L, mean = 10), 0)
lg$debug("Server busy, retrying in %.2f seconds", delay, try = retry)
Sys.sleep(delay)
} else {
break
}
break
# FIXME:
#if (isTRUE(response$oml_code %in% c(107L))) {
# delay = max(rnorm(1L, mean = 10), 0)
# lg$debug("Server busy, retrying in %.2f seconds", delay, try = retry)
# Sys.sleep(delay)
#} else {
# break
#}
}
}

@@ -134,6 +137,15 @@ get_paginated_table = function(query_type, ..., limit, server) {
query = build_filter_query(query_type, dots, server)

response = get_json(query, error_on_fail = FALSE, server = server)

response = httr::POST(
url = sprintf("%s/task", get_server(test_server)),
body = list(
description = httr::upload_file(desc_path)
),
query = list(api_key = api_key)
)

if (inherits(response, "server_response")) {
if (response$oml_code %in% magic_numbers$oml_no_more_results) {
# no more results
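
For context on the retry loop above: the new code currently breaks out after the first failed attempt, with the previous server-busy handling (OML error code 107, randomized delay of roughly 10 seconds) parked behind the FIXME comment. A simplified sketch of that pre-existing behaviour, where download_json is a hypothetical stand-in for download_file() plus jsonlite::fromJSON():

    get_json_with_retry = function(url, download_json, retries = 3L) {
      for (retry in seq_len(retries)) {
        response = download_json(url)
        if (isTRUE(response$ok)) {
          return(response$json)
        }
        if (retry < retries && isTRUE(response$oml_code %in% 107L)) {  # 107: server busy
          delay = max(rnorm(1L, mean = 10), 0)
          message(sprintf("Server busy, retrying in %.2f seconds", delay))
          Sys.sleep(delay)
        } else {
          break
        }
      }
      NULL  # all attempts failed
    }
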
2 changes: 1 addition & 1 deletion R/list_oml_data.R
@@ -53,7 +53,7 @@ list_oml_data = function(data_id = NULL, data_name = NULL, number_instances = NU
number_classes = NULL, number_missing_values = NULL, tag = NULL, limit = limit_default(),
test_server = test_server_default(), ...) {

tab = get_paginated_table("data",
tab = get_paginated_table("datasets",
data_id = data_id,
data_name = data_name,
number_instances = number_instances,
2 changes: 1 addition & 1 deletion R/list_oml_flows.R
@@ -4,7 +4,7 @@
#' @export
list_oml_flows = function(uploader = NULL, tag = NULL, limit = limit_default(),
test_server = test_server_default(), ...) {
tab = get_paginated_table("flow",
tab = get_paginated_table("flows",
uploader = uploader,
tag = tag,
limit = limit,
2 changes: 1 addition & 1 deletion R/list_oml_measures.R
@@ -2,7 +2,7 @@
#' @export
list_oml_measures = function(test_server = test_server_default()) {
server = get_server(test_server)
measures = get_json(sprintf("%s/json/evaluationmeasure/list", server),
measures = get_json(sprintf("%s/py/evaluationmeasure/list", server),
server = server)[[c(1L, 1L, 1L)]]
data.table(measure = measures)
}
2 changes: 1 addition & 1 deletion R/list_oml_runs.R
@@ -4,7 +4,7 @@
#' @export
list_oml_runs = function(run_id = NULL, task_id = NULL, tag = NULL, flow_id = NULL,
limit = limit_default(), test_server = test_server_default(), ...) {
tab = get_paginated_table("run",
tab = get_paginated_table("runs",
run = run_id,
task = task_id,
tag = tag,
2 changes: 1 addition & 1 deletion R/list_oml_setups.R
@@ -6,7 +6,7 @@
#' @export
list_oml_setups = function(flow_id = NULL, setup_id = NULL, tag = NULL, limit = limit_default(),
test_server = test_server_default(), ...) {
tab = get_paginated_table("setup",
tab = get_paginated_table("setups",
flow = flow_id,
setup = setup_id,
tag = tag,
2 changes: 1 addition & 1 deletion R/list_oml_tasks.R
@@ -16,7 +16,7 @@ list_oml_tasks = function(task_id = NULL, data_id = NULL, number_instances = NUL
clust = "5"
)
}
tab = get_paginated_table("task",
tab = get_paginated_table("tasks",
task_id = task_id,
data_id = data_id,
number_instances = number_instances,
1 change: 1 addition & 0 deletions R/magic_numbers.R
@@ -1,4 +1,5 @@
magic_numbers = list(
# FIXME: These will not be needed anymore / will change
oml_no_more_results = c(
data_set = 372L,
task = 482L,
2 changes: 1 addition & 1 deletion R/utils.R
@@ -20,7 +20,7 @@ get_desc_downloader = function(type) {
}

get_server = function(test_server) {
if (test_server) "https://test.openml.org/api/v1" else "https://www.openml.org/api/v1"
if (test_server) "https://test.openml.org/api/v1" else "https://test.openml.org/py"
}

catf_estimation_procedure = function(estimation_procedure) {
Binary file added inst/old/odata61.rds
Binary file added inst/old/otask61.rds
13 changes: 13 additions & 0 deletions inst/testthat/helper_expectation.R
@@ -53,6 +53,13 @@ expect_oml_data = function(data) {
expect_names(data$feature_names, "strict")
expect_subset(data$feature_names, colnames(data$data))
expect_disjunct(data$target_names, data$feature_names)
expect_character(data$desc$default_target_attribute, null.ok = TRUE)
expect_character(data$desc$row_id_attribute, null.ok = TRUE)
expect_character(data$desc$ignore_attribute, null.ok = TRUE)
expect_character(data$desc$creator, null.ok = TRUE)
expect_character(data$desc$contributor, null.ok = TRUE)
expect_character(data$desc$paper_url, null.ok = TRUE)
expect_character(data$desc$tag, null.ok = TRUE)
# can't do this because after OpenML's parquet transition some features seem to be missing
# expect_set_equal(names(data$data), c(data$feature_names, data$target_names))
expect_count(data$nrow)
@@ -65,6 +72,12 @@ expect_oml_data = function(data) {
}
expect_flag(data$parquet)
backend = as_data_backend(data)
if (length(data$desc$default_target_attribute)) {
expect_choice(data$desc$default_target_attribute, backend$colnames)
}
if (length(data$desc$ignore_attribute)) {
expect_choice(data$desc$ignore_attribute, backend$colnames)
}
expect_r6(backend, paste0("DataBackend"))
}

48 changes: 48 additions & 0 deletions old_objects.R
@@ -0,0 +1,48 @@
library(mlr3oml)

odata = odt(61)
odata$download()
saveRDS(odata, file.path("./inst/old/odata61.rds"))

otask = otsk(61)
otask$download()
saveRDS(otask, file.path("./inst/old/otask61.rds"))

oflow = oflw(1)
oflow$download()
saveRDS(oflow, file.path(".inst/old/l"))


test_that("new api works for OMLData", {
old = readRDS(system.file("old", "odata61.rds", package = "mlr3oml"))
new = odt(61, test_server = TRUE)
all.equal(old, new)

fields = unique(c(names(old$desc), names(new$desc)))

for (field in fields) {
if (!isTRUE(all.equal(old$desc[[field]], new$desc[[field]]))) {
print(field)
print("old:")
print(old$desc[[field]])
print("new:")
print(new$desc[[field]])
}
}
})

test_that("new api works for OMLTask", {
oldtask = readRDS(system.file("old", "otask61.rds", package = "mlr3oml"))
download_desc_task(61, get_server(FALSE))
newdesc = get_json(paste0(get_server(FALSE), "/tasks/61"),
simplify_data_frame = FALSE, server = get_server(FALSE)
)
path = tempfile()
response = download_file(paste0(get_server(FALSE), "/tasks/61"), path, server = get_server(FALSE))
x = readLines(path)
olddesc = oldtask$desc
oldtask$desc
newdesc
all.equal(old, new)

})
2 changes: 0 additions & 2 deletions tests/testthat/test_OMLData.R
@@ -183,5 +183,3 @@ test_that("download runs without error", {
# expect_data_table(odata$data)
# #expect_class(odata$data[["Timestamp"]], "POSIXct")
#})