From 3cef0cf4629101e4914bab3dd5d134750ef6da3e Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Fri, 9 Jul 2021 18:20:45 +0200 Subject: [PATCH 1/8] datasets --- src/openml.jl | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/openml.jl b/src/openml.jl index 26c6f8a..b19d1a4 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -257,6 +257,29 @@ function load_List_And_Filter(filters::String; api_key::String = "") return nothing end +function todf(entry) + if length(entry["quality"]) > 0 + dq = vcat(DataFrame.(entry["quality"])...) + dq.id = fill(entry["did"], nrow(dq)) + dq = unstack(dq, :id, :name, :value) + else + dq = DataFrame() + end + hcat(DataFrame([k => entry[k] for k in keys(entry) + if k ∉ ("did", "quality", "file_id", "md5_checksum")]), dq) +end + +""" + datasets(filter = ""; api_key = "") + +List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of the filter. +""" +function datasets(filter = ""; api_key = "") + data = MLJOpenML.load_List_And_Filter(filter; api_key) + df = reduce(vcat, todf.(data["data"]["dataset"]), cols = :union) + select(df, :id, Not([:id, :format, :version]), :format, :version) +end + # Flow API # Task API From c75b6ce330756d03e736a49416b859eb467c8f47 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Fri, 9 Jul 2021 18:31:34 +0200 Subject: [PATCH 2/8] fix missing id --- src/openml.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openml.jl b/src/openml.jl index b19d1a4..1ba3e89 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -263,7 +263,7 @@ function todf(entry) dq.id = fill(entry["did"], nrow(dq)) dq = unstack(dq, :id, :name, :value) else - dq = DataFrame() + dq = DataFrame(id = [entry["did"]]) end hcat(DataFrame([k => entry[k] for k in keys(entry) if k ∉ ("did", "quality", "file_id", "md5_checksum")]), dq) From 50a70f45fce5e751d338040afde15c5978451d23 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 13 Jul 2021 11:53:15 +0200 Subject: [PATCH 3/8] drop 
DataFrames --- src/openml.jl | 49 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 16 deletions(-) diff --git a/src/openml.jl b/src/openml.jl index 1ba3e89..f266014 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -257,29 +257,46 @@ function load_List_And_Filter(filters::String; api_key::String = "") return nothing end -function todf(entry) - if length(entry["quality"]) > 0 - dq = vcat(DataFrame.(entry["quality"])...) - dq.id = fill(entry["did"], nrow(dq)) - dq = unstack(dq, :id, :name, :value) - else - dq = DataFrame(id = [entry["did"]]) - end - hcat(DataFrame([k => entry[k] for k in keys(entry) - if k ∉ ("did", "quality", "file_id", "md5_checksum")]), dq) -end +qualitynames(x) = haskey(x, "name") ? [x["name"]] : [] """ - datasets(filter = ""; api_key = "") + list_datasets(filter = ""; api_key = "", output_format = NamedTuple) -List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of the filter. +List OpenML list_datasets. See [`load_List_And_Filter`](@ref) for the format of +the filter. As an alternative `output_format` one can choose other table types, +like `DataFrame`, if the `DataFrames` package is loaded. """ -function datasets(filter = ""; api_key = "") +function list_datasets(filter = ""; api_key = "", output_format = NamedTuple) data = MLJOpenML.load_List_And_Filter(filter; api_key) - df = reduce(vcat, todf.(data["data"]["dataset"]), cols = :union) - select(df, :id, Not([:id, :format, :version]), :format, :version) + datasets = data["data"]["dataset"] + qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) 
for entry in datasets]...))) + result = merge((id = Int[], name = String[], status = String[]), + NamedTuple{tuple(qualities...)}(ntuple(i -> Union{Missing, Int}[], length(qualities)))) + for entry in datasets + push!(result.id, entry["did"]) + push!(result.name, entry["name"]) + push!(result.status, entry["status"]) + for quality in entry["quality"] + push!(getproperty(result, Symbol(quality["name"])), + Meta.parse(quality["value"])) + end + for quality in qualities + if length(getproperty(result, quality)) < length(result.id) + push!(getproperty(result, quality), missing) + end + end + end + output_format(result) end +""" + describe_dataset(id) + +Load and show the OpenML description of the data set `id`. +Use [`list_datasets`](@ref) to browse available data sets. +""" +describe_dataset(id) = Text(load_Dataset_Description(id)["data_set_description"]["description"]) + # Flow API # Task API From 4bb654313480590b6c891c7521625e77a3b7ad97 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Tue, 13 Jul 2021 12:06:39 +0200 Subject: [PATCH 4/8] improve docstrings --- src/openml.jl | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/src/openml.jl b/src/openml.jl index f266014..326b573 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -262,9 +262,18 @@ qualitynames(x) = haskey(x, "name") ? [x["name"]] : [] """ list_datasets(filter = ""; api_key = "", output_format = NamedTuple) -List OpenML list_datasets. See [`load_List_And_Filter`](@ref) for the format of +List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of the filter. As an alternative `output_format` one can choose other table types, like `DataFrame`, if the `DataFrames` package is loaded. 
+ +# Examples +``` +julia> using DataFrames + +julia> ds = MLJOpenML.list_datasets("/tag/OpenML100/", output_format = DataFrame) + +julia> sort!(ds, :NumberOfFeatures) +``` """ function list_datasets(filter = ""; api_key = "", output_format = NamedTuple) data = MLJOpenML.load_List_And_Filter(filter; api_key) @@ -294,6 +303,28 @@ end Load and show the OpenML description of the data set `id`. Use [`list_datasets`](@ref) to browse available data sets. + +# Examples +``` +julia> MLJOpenML.describe_dataset(6) +**Author**: David J. Slate +**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 +**Please cite**: P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". Machine Learning 6(2), 1991 + +1. TITLE: + Letter Image Recognition Data + + The objective is to identify each of a large number of black-and-white + rectangular pixel displays as one of the 26 capital letters in the English + alphabet. The character images were based on 20 different fonts and each + letter within these 20 fonts was randomly distorted to produce a file of + 20,000 unique stimuli. Each stimulus was converted into 16 primitive + numerical attributes (statistical moments and edge counts) which were then + scaled to fit into a range of integer values from 0 through 15. We + typically train on the first 16000 items and then use the resulting model + to predict the letter category for the remaining 4000. See the article + cited above for more details. 
+``` """ describe_dataset(id) = Text(load_Dataset_Description(id)["data_set_description"]["description"]) From ab12219a03b30b29e0b0879c804106f921420b39 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Wed, 14 Jul 2021 14:24:53 +0200 Subject: [PATCH 5/8] use markdown --- src/openml.jl | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/openml.jl b/src/openml.jl index 326b573..6d4659d 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -1,5 +1,6 @@ using HTTP using JSON +using Markdown const API_URL = "https://www.openml.org/api/v1/json" @@ -307,26 +308,28 @@ Use [`list_datasets`](@ref) to browse available data sets. # Examples ``` julia> MLJOpenML.describe_dataset(6) -**Author**: David J. Slate -**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 -**Please cite**: P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". Machine Learning 6(2), 1991 + Author: David J. Slate Source: UCI + (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P. + W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". + Machine Learning 6(2), 1991 + + 1. TITLE: -1. TITLE: Letter Image Recognition Data - The objective is to identify each of a large number of black-and-white - rectangular pixel displays as one of the 26 capital letters in the English - alphabet. The character images were based on 20 different fonts and each - letter within these 20 fonts was randomly distorted to produce a file of - 20,000 unique stimuli. Each stimulus was converted into 16 primitive - numerical attributes (statistical moments and edge counts) which were then - scaled to fit into a range of integer values from 0 through 15. We - typically train on the first 16000 items and then use the resulting model - to predict the letter category for the remaining 4000. See the article - cited above for more details. 
+ The objective is to identify each of a large number of black-and-white + rectangular pixel displays as one of the 26 capital letters in the English + alphabet. The character images were based on 20 different fonts and each + letter within these 20 fonts was randomly distorted to produce a file of + 20,000 unique stimuli. Each stimulus was converted into 16 primitive + numerical attributes (statistical moments and edge counts) which were then + scaled to fit into a range of integer values from 0 through 15. We + typically train on the first 16000 items and then use the resulting model + to predict the letter category for the remaining 4000. See the article + cited above for more details. ``` """ -describe_dataset(id) = Text(load_Dataset_Description(id)["data_set_description"]["description"]) +describe_dataset(id) = Markdown.parse(load_Dataset_Description(id)["data_set_description"]["description"]) # Flow API From 07956cfbf7d8f1a64f52fe82dd7587ab25ec302e Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Wed, 14 Jul 2021 16:59:26 +0200 Subject: [PATCH 6/8] fix invalid keyword argument for old julia versions --- src/openml.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openml.jl b/src/openml.jl index 6d4659d..0df532f 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -277,7 +277,7 @@ julia> sort!(ds, :NumberOfFeatures) ``` """ function list_datasets(filter = ""; api_key = "", output_format = NamedTuple) - data = MLJOpenML.load_List_And_Filter(filter; api_key) + data = MLJOpenML.load_List_And_Filter(filter; api_key = api_key) datasets = data["data"]["dataset"] qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) 
for entry in datasets]...))) result = merge((id = Int[], name = String[], status = String[]), From c1a25dbe969fc88b4890120d9030b026beae9e83 Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Wed, 14 Jul 2021 16:59:55 +0200 Subject: [PATCH 7/8] add Markdown --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index 3817e8f..7af4594 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "1.0.0" [deps] HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a" [compat] HTTP = "^0.8, 0.9" From e0f6ba745a7326f33aa73071c08e4973f3b8ccaf Mon Sep 17 00:00:00 2001 From: Johanni Brea Date: Sat, 17 Jul 2021 11:45:32 +0200 Subject: [PATCH 8/8] improve doc and tag handling --- README.md | 12 ++++++++- src/openml.jl | 74 ++++++++++++++++++++++++++++----------------------- 2 files changed, 51 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index aafff7b..213223e 100644 --- a/README.md +++ b/README.md @@ -29,12 +29,22 @@ rowtable = MLJOpenML.load(61) Convert to a `DataFrame`: -``` +```julia Pkg.add("DataFrames") using DataFrames df = DataFrame(rowtable) ``` +To browse datasets use + +```julia +using DataFrames +ds = MLJOpenML.list_datasets(output_format = DataFrame) +MLJOpenML.describe_dataset(6) +MLJOpenML.list_tags() # lists valid tags +ds100 = MLJOpenML.list_datasets(tag = "OpenML100", output_format = DataFrame) +``` + ## Documentation Documentation is provided in the [OpenML diff --git a/src/openml.jl b/src/openml.jl index 0df532f..a069df3 100644 --- a/src/openml.jl +++ b/src/openml.jl @@ -206,33 +206,9 @@ function load_Data_Qualities(id::Int; api_key::String = "") end """ -List datasets, possibly filtered by a range of properties. -Any number of properties can be combined by listing them one after -the other in the -form '/data/list/{filter}/{value}/{filter}/{value}/...' 
-Returns an array with all datasets that match the constraints. - -Any combination of these filters /limit/{limit}/offset/{offset} - -returns only {limit} results starting from result number {offset}. -Useful for paginating results. With /limit/5/offset/10, - results 11..15 will be returned. - -Both limit and offset need to be specified. -/status/{status} - returns only datasets with a given status, -either 'active', 'deactivated', or 'in_preparation'. -/tag/{tag} - returns only datasets tagged with the given tag. -/{data_quality}/{range} - returns only tasks for which the -underlying datasets have certain qualities. -{data_quality} can be data_id, data_name, data_version, number_instances, -number_features, number_classes, number_missing_values. {range} can be a -specific value or a range in the form 'low..high'. -Multiple qualities can be combined, as in -'number_instances/0..50/number_features/0..10'. - -- 370 - Illegal filter specified. -- 371 - Filter values/ranges not properly specified. -- 372 - No results. There where no matches for the given constraints. -- 373 - Can not specify an offset without a limit. + load_List_And_Filter(filters; api_key = "") + +See [OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters). """ function load_List_And_Filter(filters::String; api_key::String = "") if api_key == "" @@ -261,23 +237,35 @@ end qualitynames(x) = haskey(x, "name") ? [x["name"]] : [] """ - list_datasets(filter = ""; api_key = "", output_format = NamedTuple) + list_datasets(; tag = nothing, filters = "", api_key = "", output_format = NamedTuple) -List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of -the filter. As an alternative `output_format` one can choose other table types, -like `DataFrame`, if the `DataFrames` package is loaded. +Lists all active OpenML datasets, if `tag = nothing` (default). +To list only datasets with a given tag, choose one of the tags in [`list_tags()`](@ref). 
+An alternative `output_format` can be chosen, e.g. `DataFrame`, if the +`DataFrames` package is loaded. Choose `filters` as specified in the official +[openml API](https://www.openml.org/api_docs#!/data/get_data_list_filters) +(caveat: this function does not check for valid filters). # Examples ``` julia> using DataFrames -julia> ds = MLJOpenML.list_datasets("/tag/OpenML100/", output_format = DataFrame) +julia> ds = MLJOpenML.list_datasets(tag = "OpenML100", output_format = DataFrame) julia> sort!(ds, :NumberOfFeatures) ``` """ -function list_datasets(filter = ""; api_key = "", output_format = NamedTuple) - data = MLJOpenML.load_List_And_Filter(filter; api_key = api_key) +function list_datasets(; tag = nothing, filters = "", + api_key = "", output_format = NamedTuple) + if tag !== nothing + if is_valid_tag(tag) + filters *= "/tag/$tag" + else + @warn "$tag is not a valid tag. See `list_tags()` for a list of tags." + return + end + end + data = MLJOpenML.load_List_And_Filter(filters; api_key = api_key) datasets = data["data"]["dataset"] qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...))) result = merge((id = Int[], name = String[], status = String[]), @@ -299,6 +287,24 @@ function list_datasets(filter = ""; api_key = "", output_format = NamedTuple) output_format(result) end +is_valid_tag(tag::String) = tag ∈ list_tags() +is_valid_tag(tag) = false + +""" + list_tags() + +List all available tags. +""" +function list_tags() + url = string(API_URL, "/data/tag/list") + try + r = HTTP.request("GET", url) + return JSON.parse(String(r.body))["data_tag_list"]["tag"] + catch + return nothing + end +end + """ describe_dataset(id)