From 3cef0cf4629101e4914bab3dd5d134750ef6da3e Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Fri, 9 Jul 2021 18:20:45 +0200
Subject: [PATCH 1/8] datasets

---
 src/openml.jl | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/src/openml.jl b/src/openml.jl
index 26c6f8a..b19d1a4 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -257,6 +257,29 @@ function load_List_And_Filter(filters::String; api_key::String = "")
     return nothing
 end
 
+function todf(entry)
+    if length(entry["quality"]) > 0
+        dq = vcat(DataFrame.(entry["quality"])...)
+        dq.id = fill(entry["did"], nrow(dq))
+        dq = unstack(dq, :id, :name, :value)
+    else
+        dq = DataFrame()
+    end
+    hcat(DataFrame([k => entry[k] for k in keys(entry)
+                   if k ∉ ("did", "quality", "file_id", "md5_checksum")]), dq)
+end
+
+"""
+    datasets(filter = ""; api_key = "")
+
+List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of the filter.
+"""
+function datasets(filter = ""; api_key = "")
+    data = MLJOpenML.load_List_And_Filter(filter; api_key)
+    df = reduce(vcat, todf.(data["data"]["dataset"]), cols = :union)
+    select(df, :id, Not([:id, :format, :version]), :format, :version)
+end
+
 # Flow API
 
 # Task API

From c75b6ce330756d03e736a49416b859eb467c8f47 Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Fri, 9 Jul 2021 18:31:34 +0200
Subject: [PATCH 2/8] fix missing id

---
 src/openml.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/openml.jl b/src/openml.jl
index b19d1a4..1ba3e89 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -263,7 +263,7 @@ function todf(entry)
         dq.id = fill(entry["did"], nrow(dq))
         dq = unstack(dq, :id, :name, :value)
     else
-        dq = DataFrame()
+        dq = DataFrame(id = [entry["did"]])
     end
     hcat(DataFrame([k => entry[k] for k in keys(entry)
                    if k ∉ ("did", "quality", "file_id", "md5_checksum")]), dq)

From 50a70f45fce5e751d338040afde15c5978451d23 Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Tue, 13 Jul 2021 11:53:15 +0200
Subject: [PATCH 3/8] drop DataFrames

---
 src/openml.jl | 49 +++++++++++++++++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 16 deletions(-)

diff --git a/src/openml.jl b/src/openml.jl
index 1ba3e89..f266014 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -257,29 +257,46 @@ function load_List_And_Filter(filters::String; api_key::String = "")
     return nothing
 end
 
-function todf(entry)
-    if length(entry["quality"]) > 0
-        dq = vcat(DataFrame.(entry["quality"])...)
-        dq.id = fill(entry["did"], nrow(dq))
-        dq = unstack(dq, :id, :name, :value)
-    else
-        dq = DataFrame(id = [entry["did"]])
-    end
-    hcat(DataFrame([k => entry[k] for k in keys(entry)
-                   if k ∉ ("did", "quality", "file_id", "md5_checksum")]), dq)
-end
+qualitynames(x) = haskey(x, "name") ? [x["name"]] : []
 
 """
-    datasets(filter = ""; api_key = "")
+    list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
 
-List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of the filter.
+List OpenML list_datasets. See [`load_List_And_Filter`](@ref) for the format of
+the filter. As an alternative `output_format` one can choose other table types,
+like `DataFrame`, if the `DataFrames` package is loaded.
 """
-function datasets(filter = ""; api_key = "")
+function list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
     data = MLJOpenML.load_List_And_Filter(filter; api_key)
-    df = reduce(vcat, todf.(data["data"]["dataset"]), cols = :union)
-    select(df, :id, Not([:id, :format, :version]), :format, :version)
+    datasets = data["data"]["dataset"]
+    qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...)))
+    result = merge((id = Int[], name = String[], status = String[]),
+                   NamedTuple{tuple(qualities...)}(ntuple(i -> Union{Missing, Int}[], length(qualities))))
+    for entry in datasets
+        push!(result.id, entry["did"])
+        push!(result.name, entry["name"])
+        push!(result.status, entry["status"])
+        for quality in entry["quality"]
+            push!(getproperty(result, Symbol(quality["name"])),
+                  Meta.parse(quality["value"]))
+        end
+        for quality in qualities
+            if length(getproperty(result, quality)) < length(result.id)
+                push!(getproperty(result, quality), missing)
+            end
+        end
+    end
+    output_format(result)
 end
 
+"""
+    describe_dataset(id)
+
+Load and show the OpenML description of the data set `id`.
+Use [`list_datasets`](@ref) to browse available data sets.
+"""
+describe_dataset(id) =  Text(load_Dataset_Description(id)["data_set_description"]["description"])
+
 # Flow API
 
 # Task API

From 4bb654313480590b6c891c7521625e77a3b7ad97 Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Tue, 13 Jul 2021 12:06:39 +0200
Subject: [PATCH 4/8] improve docstrings

---
 src/openml.jl | 33 ++++++++++++++++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/src/openml.jl b/src/openml.jl
index f266014..326b573 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -262,9 +262,18 @@ qualitynames(x) = haskey(x, "name") ? [x["name"]] : []
 """
     list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
 
-List OpenML list_datasets. See [`load_List_And_Filter`](@ref) for the format of
+List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of
 the filter. As an alternative `output_format` one can choose other table types,
 like `DataFrame`, if the `DataFrames` package is loaded.
+
+# Examples
+```
+julia> using DataFrames
+
+julia> ds = MLJOpenML.list_datasets("/tag/OpenML100/", output_format = DataFrame)
+
+julia> sort!(ds, :NumberOfFeatures)
+```
 """
 function list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
     data = MLJOpenML.load_List_And_Filter(filter; api_key)
@@ -294,6 +303,28 @@ end
 
 Load and show the OpenML description of the data set `id`.
 Use [`list_datasets`](@ref) to browse available data sets.
+
+# Examples
+```
+julia> MLJOpenML.describe_dataset(6)
+**Author**: David J. Slate
+**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991
+**Please cite**: P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". Machine Learning 6(2), 1991
+
+1. TITLE:
+  Letter Image Recognition Data
+
+    The objective is to identify each of a large number of black-and-white
+    rectangular pixel displays as one of the 26 capital letters in the English
+    alphabet.  The character images were based on 20 different fonts and each
+    letter within these 20 fonts was randomly distorted to produce a file of
+    20,000 unique stimuli.  Each stimulus was converted into 16 primitive
+    numerical attributes (statistical moments and edge counts) which were then
+    scaled to fit into a range of integer values from 0 through 15.  We
+    typically train on the first 16000 items and then use the resulting model
+    to predict the letter category for the remaining 4000.  See the article
+    cited above for more details.
+```
 """
 describe_dataset(id) =  Text(load_Dataset_Description(id)["data_set_description"]["description"])
 

From ab12219a03b30b29e0b0879c804106f921420b39 Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Wed, 14 Jul 2021 14:24:53 +0200
Subject: [PATCH 5/8] use markdown

---
 src/openml.jl | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/src/openml.jl b/src/openml.jl
index 326b573..6d4659d 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -1,5 +1,6 @@
 using HTTP
 using JSON
+using Markdown
 
 const API_URL = "https://www.openml.org/api/v1/json"
 
@@ -307,26 +308,28 @@ Use [`list_datasets`](@ref) to browse available data sets.
 # Examples
 ```
 julia> MLJOpenML.describe_dataset(6)
-**Author**: David J. Slate
-**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991
-**Please cite**: P. W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers". Machine Learning 6(2), 1991
+  Author: David J. Slate Source: UCI
+  (https://archive.ics.uci.edu/ml/datasets/Letter+Recognition) - 01-01-1991 Please cite: P.
+  W. Frey and D. J. Slate. "Letter Recognition Using Holland-style Adaptive Classifiers".
+  Machine Learning 6(2), 1991
+
+    1. TITLE:
 
-1. TITLE:
   Letter Image Recognition Data
 
-    The objective is to identify each of a large number of black-and-white
-    rectangular pixel displays as one of the 26 capital letters in the English
-    alphabet.  The character images were based on 20 different fonts and each
-    letter within these 20 fonts was randomly distorted to produce a file of
-    20,000 unique stimuli.  Each stimulus was converted into 16 primitive
-    numerical attributes (statistical moments and edge counts) which were then
-    scaled to fit into a range of integer values from 0 through 15.  We
-    typically train on the first 16000 items and then use the resulting model
-    to predict the letter category for the remaining 4000.  See the article
-    cited above for more details.
+  The objective is to identify each of a large number of black-and-white
+  rectangular pixel displays as one of the 26 capital letters in the English
+  alphabet.  The character images were based on 20 different fonts and each
+  letter within these 20 fonts was randomly distorted to produce a file of
+  20,000 unique stimuli.  Each stimulus was converted into 16 primitive
+  numerical attributes (statistical moments and edge counts) which were then
+  scaled to fit into a range of integer values from 0 through 15.  We
+  typically train on the first 16000 items and then use the resulting model
+  to predict the letter category for the remaining 4000.  See the article
+  cited above for more details.
 ```
 """
-describe_dataset(id) =  Text(load_Dataset_Description(id)["data_set_description"]["description"])
+describe_dataset(id) =  Markdown.parse(load_Dataset_Description(id)["data_set_description"]["description"])
 
 # Flow API
 

From 07956cfbf7d8f1a64f52fe82dd7587ab25ec302e Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Wed, 14 Jul 2021 16:59:26 +0200
Subject: [PATCH 6/8] fix invalid keyword argument for old julia versions

---
 src/openml.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/openml.jl b/src/openml.jl
index 6d4659d..0df532f 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -277,7 +277,7 @@ julia> sort!(ds, :NumberOfFeatures)
 ```
 """
 function list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
-    data = MLJOpenML.load_List_And_Filter(filter; api_key)
+    data = MLJOpenML.load_List_And_Filter(filter; api_key = api_key)
     datasets = data["data"]["dataset"]
     qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...)))
     result = merge((id = Int[], name = String[], status = String[]),

From c1a25dbe969fc88b4890120d9030b026beae9e83 Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Wed, 14 Jul 2021 16:59:55 +0200
Subject: [PATCH 7/8] add Markdown

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index 3817e8f..7af4594 100644
--- a/Project.toml
+++ b/Project.toml
@@ -6,6 +6,7 @@ version = "1.0.0"
 [deps]
 HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+Markdown = "d6f4376e-aef5-505a-96c1-9c027394607a"
 
 [compat]
 HTTP = "^0.8, 0.9"

From e0f6ba745a7326f33aa73071c08e4973f3b8ccaf Mon Sep 17 00:00:00 2001
From: Johanni Brea <jbrea@users.noreply.github.com>
Date: Sat, 17 Jul 2021 11:45:32 +0200
Subject: [PATCH 8/8] improve doc and tag handling

---
 README.md     | 12 ++++++++-
 src/openml.jl | 74 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/README.md b/README.md
index aafff7b..213223e 100644
--- a/README.md
+++ b/README.md
@@ -29,12 +29,22 @@ rowtable = MLJOpenML.load(61)
 
 Convert to a `DataFrame`:
 
-```
+```julia
 Pkg.add("DataFrames")
 using DataFrames
 df = DataFrame(rowtable)
 ```
 
+To browse datasets use
+
+```julia
+using DataFrames
+ds = MLJOpenML.list_datasets(output_format = DataFrame)
+MLJOpenML.describe_dataset(6)
+MLJOpenML.list_tags() # lists valid tags
+ds100 = MLJOpenML.list_datasets(tag = "OpenML100", output_format = DataFrame)
+```
+
 ## Documentation
 
 Documentation is provided in the [OpenML
diff --git a/src/openml.jl b/src/openml.jl
index 0df532f..a069df3 100644
--- a/src/openml.jl
+++ b/src/openml.jl
@@ -206,33 +206,9 @@ function load_Data_Qualities(id::Int; api_key::String = "")
 end
 
 """
-List datasets, possibly filtered by a range of properties.
-Any number of properties can be combined by listing them one after
-the other in the
-form '/data/list/{filter}/{value}/{filter}/{value}/...'
-Returns an array with all datasets that match the constraints.
-
-Any combination of these filters /limit/{limit}/offset/{offset} -
-returns only {limit} results starting from result number {offset}.
-Useful for paginating results. With /limit/5/offset/10,
-    results 11..15 will be returned.
-
-Both limit and offset need to be specified.
-/status/{status} - returns only datasets with a given status,
-either 'active', 'deactivated', or 'in_preparation'.
-/tag/{tag} - returns only datasets tagged with the given tag.
-/{data_quality}/{range} - returns only tasks for which the
-underlying datasets have certain qualities.
-{data_quality} can be data_id, data_name, data_version, number_instances,
-number_features, number_classes, number_missing_values. {range} can be a
-specific value or a range in the form 'low..high'.
-Multiple qualities can be combined, as in
-'number_instances/0..50/number_features/0..10'.
-
-- 370 - Illegal filter specified.
-- 371 - Filter values/ranges not properly specified.
-- 372 - No results. There where no matches for the given constraints.
-- 373 - Can not specify an offset without a limit.
+    load_List_And_Filter(filters; api_key = "")
+
+See [OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters).
 """
 function load_List_And_Filter(filters::String; api_key::String = "")
     if api_key == ""
@@ -261,23 +237,35 @@ end
 qualitynames(x) = haskey(x, "name") ? [x["name"]] : []
 
 """
-    list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
+    list_datasets(; tag = nothing, filters = "", api_key = "", output_format = NamedTuple)
 
-List OpenML datasets. See [`load_List_And_Filter`](@ref) for the format of
-the filter. As an alternative `output_format` one can choose other table types,
-like `DataFrame`, if the `DataFrames` package is loaded.
+Lists all active OpenML datasets, if `tag = nothing` (default).
+To list only datasets with a given tag, choose one of the tags in [`list_tags()`](@ref).
+An alternative `output_format` can be chosen, e.g. `DataFrame`, if the
+`DataFrames` package is loaded. Choose `filters` as specified in the official
+[OpenML API](https://www.openml.org/api_docs#!/data/get_data_list_filters)
+(caveat: this function does not check for valid filters).
 
 # Examples
 ```
 julia> using DataFrames
 
-julia> ds = MLJOpenML.list_datasets("/tag/OpenML100/", output_format = DataFrame)
+julia> ds = MLJOpenML.list_datasets(tag = "OpenML100", output_format = DataFrame)
 
 julia> sort!(ds, :NumberOfFeatures)
 ```
 """
-function list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
-    data = MLJOpenML.load_List_And_Filter(filter; api_key = api_key)
+function list_datasets(; tag = nothing, filters = "",
+                         api_key = "", output_format = NamedTuple)
+    if tag !== nothing
+        if is_valid_tag(tag)
+            filters *= "/tag/$tag"
+        else
+            @warn "$tag is not a valid tag. See `list_tags()` for a list of tags."
+            return
+        end
+    end
+    data = MLJOpenML.load_List_And_Filter(filters; api_key = api_key)
     datasets = data["data"]["dataset"]
     qualities = Symbol.(union(vcat([vcat(qualitynames.(entry["quality"])...) for entry in datasets]...)))
     result = merge((id = Int[], name = String[], status = String[]),
@@ -299,6 +287,24 @@ function list_datasets(filter = ""; api_key = "", output_format = NamedTuple)
     output_format(result)
 end
 
+is_valid_tag(tag::String) = tag ∈ list_tags()
+is_valid_tag(tag) = false
+
+"""
+    list_tags()
+
+List all available tags.
+"""
+function list_tags()
+    url = string(API_URL, "/data/tag/list")
+    try
+        r = HTTP.request("GET", url)
+        return JSON.parse(String(r.body))["data_tag_list"]["tag"]
+    catch
+        return nothing
+    end
+end
+
 """
     describe_dataset(id)