
created content for pull request for TT - Fragrance Dataset #776

Merged · 13 commits · Nov 26, 2024
Empty file added: .here
164 changes: 164 additions & 0 deletions data/curated/parfumo_fragrance/cleaning.R
@@ -0,0 +1,164 @@
###_____________________________________________________________________________
### Parfumo Fragrance
### Script to clean the data sourced from Kaggle
### Thanks to Olga G. Miufana!
###_____________________________________________________________________________

# packages
library(httr)
library(tidyverse)
library(jsonlite)
library(glue)
library(janitor)
library(here)

# Define the metadata URL and fetch it
metadata_url <- "https://www.kaggle.com/datasets/olgagmiufana1/parfumo-fragrance-dataset/croissant/download"
response <- httr::GET(metadata_url)

# Ensure the request succeeded
if (httr::http_status(response)$category != "Success") {
stop("Failed to fetch metadata.")
}

# Parse the metadata
metadata <- httr::content(response, as = "parsed", type = "application/json")

# Locate the ZIP file URL
distribution <- metadata$distribution
zip_url <- NULL

for (file in distribution) {
if (file$encodingFormat == "application/zip") {
zip_url <- file$contentUrl
break
}
}

if (is.null(zip_url)) {
stop("No ZIP file URL found in the metadata.")
}

# Download the ZIP file
temp_file <- base::tempfile(fileext = ".zip")
utils::download.file(zip_url, temp_file, mode = "wb")

# Unzip and read the CSV
unzip_dir <- base::tempdir()
utils::unzip(temp_file, exdir = unzip_dir)

# Locate the CSV file within the extracted contents
csv_file <- list.files(unzip_dir, pattern = "\\.csv$", full.names = TRUE)

if (length(csv_file) == 0) {
stop("No CSV file found in the unzipped contents.")
}

# Read the CSV into a dataframe
parfumo_data <- read_csv(csv_file)

# Explore the data
glimpse(parfumo_data)

###_____________________________________________________________________________
# Problems with the data!

# 1. Some columns use the literal string "N/A" instead of a true missing
#    value; this also forces several would-be numeric columns to be read
#    as character. We deal with these efficiently using {tidyselect} and
#    {dplyr}.
# 2. The perfume names include the brand, sometimes the release year, and
#    sometimes a perfume number, even though the brand and release year
#    already have their own columns. We clean the names so they contain
#    only the perfume name, and keep the perfume number as an additional
#    column.
###_____________________________________________________________________________

# we will create a few different objects while we clean
# to inspect and avoid errors
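
# quick toy check (hypothetical values, not part of the pipeline) of the
# "N/A" fix we are about to apply: dplyr::na_if() converts the literal
# string "N/A" to a true NA, after which as.numeric() succeeds cleanly
# (readr's `na =` argument to read_csv() could also handle this at read time)
tibble::tibble(x = c("7.2", "N/A", "8.1")) |>
  dplyr::mutate(x = as.numeric(dplyr::na_if(x, "N/A")))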

parfumo_data_prep <- parfumo_data |>
  dplyr::mutate(

    # replace the literal "N/A" strings with true missing values;
    # limit the replacement to character columns so no type coercion
    # is needed

    dplyr::across(
      where(is.character), ~ dplyr::na_if(., "N/A")
    )

  ) |>
dplyr::mutate(

# new perfume # variable
Number = stringr::str_extract(Name, pattern = "^#?\\s?\\d*\\.?\\d*?"),

# remove the preceding optional pound sign and space
Number = stringr::str_remove(Number, pattern = "^#?\\s?"),
.before = Name

) |>

# replace Number blanks with NA, as some have no # value
naniar::replace_with_na(replace = list(Number = "")) |>
dplyr::mutate(

  # build a temporary, per-row regex with {glue} that matches the brand
  # and/or release year where they appear at the end of the Name string
  # (note: this assumes Brand contains no regex metacharacters)

  dynamic_pattern = glue::glue("(?:\\s{Brand})*(?:\\s{`Release Year`})?$"),

# clean the perfume name
Clean_Name = stringr::str_remove_all(Name, pattern = "^#?\\s?\\d*\\.?\\d*\\s?-?\\s?"),

# again for the Brand and Release Year removal

Clean_Name = stringr::str_remove_all(Clean_Name, pattern = dynamic_pattern),
Clean_Name = stringr::str_squish(Clean_Name),
.before = Name

) |>

dplyr::select(-dynamic_pattern, -Name)

# inspect the object after initial cleaning

glimpse(parfumo_data_prep)
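
# toy illustration (hypothetical perfume) of the per-row dynamic pattern
# built above: glue() fills in each row's brand and year to produce a
# row-specific regex
toy_brand   <- "Chanel"
toy_year    <- "1921"
toy_pattern <- glue::glue("(?:\\s{toy_brand})*(?:\\s{toy_year})?$")
stringr::str_remove_all("No. 5 Chanel 1921", toy_pattern) # should leave "No. 5"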

# a new dplyr chain to continue cleaning;
# break it up into mutate-then-inspect steps to avoid errors

parfumo_data_clean <- parfumo_data_prep |>

dplyr::rename(Name = Clean_Name) |>

  # the Rating Count column has the word "Ratings" in it, preventing the
  # column from being numeric; Release Year, Rating Value, and Rating
  # Count are still character strings instead of true numeric values
  # fix!

dplyr::mutate(
`Rating Count` = stringr::str_remove_all(`Rating Count`, pattern = "\\sRatings$"),
across(c(`Release Year`, `Rating Value`, `Rating Count`), ~ as.numeric(.))
) |>

# clean the names so that they are easier to use in the future

clean_names(case = "title", sep_out = "_") |>
rename(URL = Url)

# inspect the final object

glimpse(parfumo_data_clean)
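
# toy illustration of the clean_names() call above, and of why Url must
# be renamed back to URL: title case with "_" as the output separator
tibble::tibble(`Rating Count` = 1, URL = "x") |>
  janitor::clean_names(case = "title", sep_out = "_") |>
  names() # should be "Rating_Count" and "Url"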

###_____________________________________________________________________________
### End
###_____________________________________________________________________________
30 changes: 30 additions & 0 deletions data/curated/parfumo_fragrance/instructions.md
@@ -0,0 +1,30 @@
## Prepare the dataset

These instructions are for preparing a dataset using the R programming language.
We hope to provide instructions for other programming languages eventually.

If you have not yet set up your computer for submitting a dataset, please see the full instructions at <https://github.com/rfordatascience/tidytuesday/blob/master/.github/pr_instructions.md>.

1. `cleaning.R`: Modify the `cleaning.R` file to get and clean the data.
- Write the code to download and clean the data in `cleaning.R`.
- If you're getting the data from a GitHub repo, remember to use the "raw" version of the URL.
- This script should result in one or more data.frames, with descriptive variable names (e.g. `players` and `teams`, not `df1` and `df2`).

2. `saving.R`: Use `saving.R` to save your datasets. This process creates both the `.csv` file(s) and the data dictionary template file(s) for your datasets. **Don't save the CSV files using a separate process because we also need the data dictionaries.**
- Run the first line of `saving.R` to create the functions we'll use to save your dataset.
- Provide the name of your directory as `dir_name`.
- Use `ttsave()` for each dataset you created in `cleaning.R`, substituting the name of your dataset for `YOUR_DATASET_DF`.

3. `{dataset}.md`: Edit the `{dataset}.md` files to describe your datasets (where `{dataset}` is the name of the dataset). These files are created by `saving.R`. There should be one file for each of your datasets. You most likely only need to edit the "description" column to provide a description of each variable.

4. `intro.md`: Edit the `intro.md` file to describe your dataset. You don't need to add a `# Title` at the top; this is just a paragraph or two to introduce the week.

5. Find at least one image for your dataset. These often come from the article about your dataset. If you can't find an image, create an example data visualization, and save the images in your folder as `png` files.

6. `meta.yaml`: Edit `meta.yaml` to provide information about your dataset and how we can credit you. You can delete lines from the `credit` block that do not apply to you.
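
Taken together, steps 1–2 might look roughly like the following sketch for this dataset (`ttsave()` and `dir_name` are the helper and variable described above; the exact arguments are defined in `saving.R`):

```r
# after cleaning.R has created parfumo_data_clean
source("saving.R")              # defines the saving helpers
dir_name <- "parfumo_fragrance" # the name of your directory
ttsave(parfumo_data_clean)      # writes the CSV and data dictionary template
```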

### Submit your pull request with the data

1. Commit the changes with this folder to your branch. In RStudio, you can do this on the "Git" tab (the "Commit" button).

2. Submit a pull request to <https://github.com/rfordatascience/tidytuesday>. In R, you can do this with `usethis::pr_push()`, and then follow the instructions in your browser.
15 changes: 15 additions & 0 deletions data/curated/parfumo_fragrance/intro.md
@@ -0,0 +1,15 @@
This week, we're diving into the fascinating world of fragrances with a dataset sourced from [Parfumo](https://www.parfumo.com/), a vibrant community of perfume enthusiasts. [Olga G.](https://www.kaggle.com/olgagmiufana1) web-scraped these data from the various fragrance sections of the above-referenced website. Here is a description from the author:

> This dataset contains detailed information about perfumes sourced from Parfumo, obtained through web scraping. It includes data on perfume ratings, olfactory notes (top, middle, and base notes), perfumers, year of release and other relevant characteristics of the perfumes listed on the Parfumo website.

> The data provides a comprehensive look at how various perfumes are rated, which families of scents they belong to, and detailed breakdowns of the key olfactory components that define their overall profile

We’ll explore how perfumes are rated, uncover the scent families they belong to, and delve into the minds of the perfumers behind them. From the year of release to the delicate composition of each scent, this dataset offers a rich olfactory experience for anyone curious about the magic behind their favorite perfumes.

Join us as we decode the stories within these perfumes, from the top notes that hit you first to the lasting base notes that linger in the air. Whether you're a fragrance aficionado or just curious about the data behind the scents, this exploration will open your eyes (and nose) to the artistry of perfume crafting. Ready to sniff out some data?

Thank you to [Nicolas Foss](https://www.linkedin.com/in/nicolas-foss) for curating this week's dataset!

Some questions to explore:

* What factors most influence the rating of a perfume?
* Are there distinct scent families that dominate the market, and how are they perceived by users?
* Has the popularity of certain fragrance notes evolved over time?
15 changes: 15 additions & 0 deletions data/curated/parfumo_fragrance/meta.yaml
@@ -0,0 +1,15 @@
title: The Scent of Data - Exploring the Parfumo Fragrance Dataset
data_source:
  title: Parfumo Fragrance Dataset
  url: https://www.kaggle.com/datasets/olgagmiufana1/parfumo-fragrance-dataset
images:
- file: parfumo_data_scatter.png
  alt: >
    A scatter plot showing the relationship between the number of ratings (x-axis) and the average perfume rating (y-axis) from the Parfumo Fragrance Dataset. The color gradient indicates the rating, ranging from blue (low ratings) to orange (high ratings). Most data points cluster near the top-left, suggesting higher ratings with fewer reviews, while ratings spread out with increasing review counts. The title reads "Relationship Between Rating Count and Actual Rating," and a legend is provided to represent the color-coded ratings. The footer includes links to the creator's LinkedIn and GitHub profiles.
- file: parfumo_data_bar.png
  alt: >
    A bar plot illustrating the evolution of fragrance ratings from 1900 to the present, based on the Parfumo Fragrance Dataset. The x-axis represents the year of release, while the y-axis shows the number of releases (Release Count). Bars are color-coded by ratings count, with a gradient ranging from blue (lower ratings count) to orange (higher ratings count). The plot highlights a significant increase in releases starting around the mid-20th century, peaking in the early 2000s. The title reads "Evolution of Fragrance Ratings Years 1900 - Present," with a legend representing the ratings count gradient. The footer includes LinkedIn and GitHub links to the creator's profiles.
credit:
  post: Nicolas Foss, Ed.D., MS | Bureau of Emergency Medical and Trauma Services > Iowa HHS
  linkedin: https://www.linkedin.com/in/nicolas-foss
  github: https://github.com/nicolasfoss