{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "1\n" ] } ], "source": [ "print(1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " 
yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "metadata": {} }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1\n" ] } ], "source": [ "print(1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " 
fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", 
"pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", 
row=1, col=2)\n",
    " fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "music_vals = [\n",
    " \"valence\",\n",
    " \"acousticness\",\n",
    " \"danceability\",\n",
    " \"duration_min\",\n",
    " \"energy\",\n",
    " \"instrumentalness\",\n",
    " \"liveness\",\n",
    " \"loudness\",\n",
    " \"popularity\",\n",
    " \"speechiness\",\n",
    " \"tempo\",\n",
    "]\n",
    "\n",
    "for col in music_vals:\n",
    " fun_subplots_plotly(df, col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Header 1.2\n",
    "\n",
    "## Header 2.2\n",
    "\n",
    "### Header 3.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Left-closed bins: [1920, 1930) is the 1920s, so a year such as 1930 is\n",
    "# labelled \"1930s\". Right-closed bins would label 1930 as \"1920s\" and leave\n",
    "# 1920 without a bin. The extra 2030 break keeps 2020 inside a bin.\n",
    "bins = pd.IntervalIndex.from_breaks(\n",
    "    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],\n",
    "    closed=\"left\",\n",
    ")\n",
    "\n",
    "# Keys are the string form of each interval because the cut result is cast\n",
    "# to str before mapping; the label is the interval's left edge plus \"s\".\n",
    "map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
    "\n",
    "# Years outside every bin become the string \"nan\", which has no key in\n",
    "# map_decade and therefore maps to a missing value.\n",
    "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\").map(map_decade)\n",
    "\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_counts = (\n",
    " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    " data_frame=decade_counts,\n",
    " y=\"decade\",\n",
    " x=\"count\",\n",
    " color=\"count\",\n",
    " color_continuous_scale=px.colors.sequential.Sunset,\n",
    " text_auto=True,\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n",
    " coloraxis_colorbar={\n",
    " \"title\": \"Count\",\n",
    " \"thicknessmode\": \"pixels\",\n",
    " \"thickness\": 20,\n",
\"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig 
= px.histogram(\n",
    " data_frame=df,\n",
    " x=col,\n",
    " color=\"decade\",\n",
    " color_discrete_sequence=px.colors.qualitative.Light24,\n",
    " )\n",
    " fig.update_traces(opacity=0.7)\n",
    " fig.update_layout(\n",
    " bargap=0.1,\n",
    " bargroupgap=0.05,\n",
    " barmode=\"stack\",\n",
    " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n",
    " legend={\"title\": \"Decade\"},\n",
    " )\n",
    " fig.update_yaxes(title=\"Count\")\n",
    " fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_decade_summary(summary, title, n_grid_cols=3):\n",
    "    \"\"\"Build a grid of horizontal bar charts, one panel per column of `summary`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    summary : pandas.DataFrame\n",
    "        Table whose index supplies the bar categories (y axis) and whose\n",
    "        columns each become one panel, titled with the column name.\n",
    "    title : str\n",
    "        Text of the figure title.\n",
    "    n_grid_cols : int, default 3\n",
    "        Panels per grid row; the number of rows grows with the column count.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    plotly.graph_objects.Figure\n",
    "        The assembled figure; the caller decides when to show it.\n",
    "    \"\"\"\n",
    "    features = list(summary.columns)\n",
    "    n_grid_rows = -(-len(features) // n_grid_cols)  # ceiling division\n",
    "\n",
    "    fig = make_subplots(\n",
    "        rows=n_grid_rows,\n",
    "        cols=n_grid_cols,\n",
    "        subplot_titles=features,\n",
    "        vertical_spacing=0.1,\n",
    "    )\n",
    "\n",
    "    # Panels are filled left to right, top to bottom, in column order.\n",
    "    for position, feature in enumerate(features):\n",
    "        fig.add_trace(\n",
    "            go.Bar(\n",
    "                y=summary.index,\n",
    "                x=summary.iloc[:, position],\n",
    "                name=feature,\n",
    "                orientation=\"h\",\n",
    "            ),\n",
    "            row=position // n_grid_cols + 1,\n",
    "            col=position % n_grid_cols + 1,\n",
    "        )\n",
    "\n",
    "    fig.update_layout(\n",
    "        title={\"text\": title, \"font\": {\"size\": 22}},\n",
    "        legend={\n",
    "            \"orientation\": \"v\",\n",
    "            \"yanchor\": \"top\",\n",
    "            \"y\": 0.2,\n",
    "            \"xanchor\": \"right\",\n",
    "            \"x\": 0.9,\n",
    "        },\n",
    "        margin={\"b\": 10, \"r\": 10},\n",
    "        height=980,\n",
    "    )\n",
    "    return fig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_mean = df.groupby(\"decade\")[cols].mean()\n",
    "\n",
    "fig = plot_decade_summary(decade_mean, \"Mean Musical Values per Decade\")\n",
    "fig.show()\n",
    "\n",
    "del decade_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_median = df.groupby(\"decade\")[cols].median()\n",
    "\n",
    "fig = plot_decade_summary(decade_median, \"Median Musical Values per Decade\")\n",
    "fig.show()\n",
    "\n",
    "del decade_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "explicit_count = (\n",
    " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n",
    " .count()\n",
    " .rename(columns={\"year\": \"count\"})\n",
    ")\n",
    "key_count = (\n",
    " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n",
    " .count()\n",
    " .rename(columns={\"year\": \"count\"})\n",
    ")\n",
    "mode_count = (\n",
    " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n",
    " .count()\n",
    " .rename(columns={\"year\": \"count\"})\n",
    ")\n",
    "\n",
    "fig = px.histogram(\n",
    " data_frame=explicit_count,\n",
    " x=\"decade\",\n",
    " y=\"count\",\n",
    " color=\"explicit\",\n",
    " histfunc=\"sum\",\n",
    " color_discrete_sequence=px.colors.qualitative.Pastel,\n",
    " text_auto=True,\n",
    ")\n",
    "fig.update_layout(\n",
    " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n",
    " bargap=0.2,\n",
    " bargroupgap=0.1,\n",
    " barmode=\"group\",\n",
    " xaxis_title=\"Decade\",\n",
" yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [],
   "source": [
    "cols = [\n",
    " \"valence\",\n",
    " \"acousticness\",\n",
    " \"danceability\",\n",
    " \"duration_min\",\n",
    " \"energy\",\n",
    " \"instrumentalness\",\n",
    " \"liveness\",\n",
    " \"loudness\",\n",
    " \"year\",\n",
    " \"speechiness\",\n",
    " \"tempo\",\n",
    "]\n",
    "\n",
    "plt.figure(figsize=(15, 10))\n",
    "\n",
    "for i, col in enumerate(cols):\n",
    " ax = plt.subplot(3, 4, i + 1)\n",
    " sns.regplot(\n",
    " data=df,\n",
    " x=\"popularity\",\n",
    " y=col,\n",
    " ax=ax,\n",
    " scatter_kws={\"alpha\": 0.5},\n",
    " line_kws={\"color\": \"#F38BA8\"},\n",
    " )\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    " \"valence\",\n",
    " \"acousticness\",\n",
    " \"danceability\",\n",
    " \"duration_min\",\n",
    " \"energy\",\n",
    " \"instrumentalness\",\n",
    " \"liveness\",\n",
    " \"loudness\",\n",
    " \"year\",\n",
    " \"speechiness\",\n",
    " \"tempo\",\n",
    " \"popularity\",\n",
    "]\n",
    "\n",
    "plt.figure(figsize=(10, 7))\n",
    "mask = np.triu(np.ones_like(df[cols].corr()))\n",
    "sns.heatmap(\n",
    " df[cols].corr(),\n",
    " annot=True,\n",
    " fmt=\".2f\",\n",
    " cmap=\"Blues\",\n",
    " mask=mask,\n",
    ")\n",
    "plt.grid(visible=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Presumably `artists` holds a stringified list such as \"['A', 'B']\" (verify\n",
    "# against data.csv): remove the list punctuation, then keep the text before\n",
    "# the first comma. The comma is the only separator; splitting on a letter\n",
    "# would truncate any name that contains it.\n",
    "df[\"first_artist\"] = (\n",
    "    df[\"artists\"]\n",
    "    .astype(str)\n",
    "    .str.replace(\"[\", \"\", regex=False)\n",
    "    .str.replace(\"]\", \"\", regex=False)\n",
    "    .str.replace(\"'\", \"\", regex=False)\n",
    "    .str.split(\",\")\n",
    "    .str[0]\n",
    ")\n",
    "df = df.drop(\"artists\", axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "popularity_check = (\n",
    "    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
    "    # A list (not a set) keeps the output column order deterministic.\n",
    "    .agg([\"mean\", \"count\", \"median\"])\n",
    "    .sort_values(by=\"mean\", ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# NOTE(review): these labels are positions in the mean-sorted ranking above,\n",
    "# so they point at different artists whenever the data or the artist parsing\n",
    "# changes. Confirm which artists were meant and filter them by name instead.\n",
    "# errors=\"ignore\" keeps a shorter ranking from raising KeyError.\n",
    "popularity_check = popularity_check.drop(\n",
    "    [16091, 16086, 16429], axis=0, errors=\"ignore\"\n",
    ")\n",
    "\n",
    "# Keep artists with more than 45 rows, then the 50 with the highest mean popularity.\n",
    "popularity_check = (\n",
    "    popularity_check[popularity_check[\"count\"] > 45]\n",
    "    .sort_values(by=\"mean\", ascending=False)\n",
    "    .head(50)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Header 1\n",
    "\n",
    "## Header 2\n",
    "\n",
    "### Header 3.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One boolean filter instead of growing a frame with concat inside a loop.\n",
    "top_artist_names = popularity_check[\"first_artist\"].unique()\n",
    "\n",
    "top_50_artists = df[df[\"first_artist\"].isin(top_artist_names)].reset_index(drop=True)\n",
    "top_50_artists.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(12, 8))\n",
    "(\n",
    " top_50_artists[\"first_artist\"]\n",
    " .value_counts()\n",
    " .sort_values(ascending=True)\n",
    " .plot(kind=\"barh\")\n",
    ")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    " \"valence\",\n",
    " \"acousticness\",\n",
    " \"danceability\",\n",
    " \"duration_min\",\n",
    " \"energy\",\n",
    " \"instrumentalness\",\n",
    " \"liveness\",\n",
    " \"loudness\",\n",
    " \"year\",\n",
    " \"speechiness\",\n",
    " \"tempo\",\n",
    "]\n",
    "\n",
    "plt.figure(figsize=(15, 10))\n",
    "\n",
    "for i, col in enumerate(cols):\n",
    " ax = plt.subplot(3, 4, i + 1)\n",
    " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n",
    "\n",
"plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": 
\"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Left-closed bins: a year ending in 0 starts its own decade (1930 -> 1930s).\n",
 "# The 2030 break is needed so that 2020 still falls inside a bin.\n",
 "bins = pd.IntervalIndex.from_breaks(\n",
 "    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],\n",
 "    closed=\"left\",\n",
 ")\n",
 "\n",
 "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n",
 "\n",
 "# Interval text -> label built from the interval's left edge,\n",
 "# e.g. the bin printed as [1930, 1940) becomes 1930s.\n",
 "map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
 "\n",
 "df[\"decade\"] = df[\"decade\"].map(map_decade)\n",
 "\n",
 "df.head()"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " 
color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "decade_mean = df.groupby(\"decade\")[cols].mean()\n",
 "\n",
 "# One horizontal-bar panel per column of `decade_mean`, placed left to right\n",
 "# and top to bottom on a 3-column grid.\n",
 "grid_cols = 3\n",
 "grid_rows = -(-decade_mean.shape[1] // grid_cols)  # ceiling division\n",
 "\n",
 "fig = make_subplots(\n",
 "    rows=grid_rows,\n",
 "    cols=grid_cols,\n",
 "    subplot_titles=tuple(decade_mean.columns),\n",
 "    vertical_spacing=0.1,\n",
 ")\n",
 "\n",
 "for position, feature in enumerate(decade_mean.columns):\n",
 "    fig.add_trace(\n",
 "        go.Bar(\n",
 "            y=decade_mean.index,\n",
 "            x=decade_mean.iloc[:, position],\n",
 "            name=feature,\n",
 "            orientation=\"h\",\n",
 "        ),\n",
 "        row=position // grid_cols + 1,\n",
 "        col=position % grid_cols + 1,\n",
 "    )\n",
 "\n",
 "fig.update_layout(\n",
 "    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
 "    legend={\n",
 "        \"orientation\": \"v\",\n",
 "        \"yanchor\": \"top\",\n",
 "        \"y\": 0.2,\n",
 "        \"xanchor\": \"right\",\n",
 "        \"x\": 0.9,\n",
 "    },\n",
 "    margin={\"b\": 10, \"r\": 10},\n",
 "    height=980,\n",
 ")\n",
 "\n",
 "fig.show()\n",
 "\n",
 "del decade_mean"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "decade_median = df.groupby(\"decade\")[cols].median()\n",
 "\n",
 "# One horizontal-bar panel per column of `decade_median`, placed left to right\n",
 "# and top to bottom on a 3-column grid.\n",
 "grid_cols = 3\n",
 "grid_rows = -(-decade_median.shape[1] // grid_cols)  # ceiling division\n",
 "\n",
 "fig = make_subplots(\n",
 "    rows=grid_rows,\n",
 "    cols=grid_cols,\n",
 "    subplot_titles=tuple(decade_median.columns),\n",
 "    vertical_spacing=0.1,\n",
 ")\n",
 "\n",
 "for position, feature in enumerate(decade_median.columns):\n",
 "    fig.add_trace(\n",
 "        go.Bar(\n",
 "            y=decade_median.index,\n",
 "            x=decade_median.iloc[:, position],\n",
 "            name=feature,\n",
 "            orientation=\"h\",\n",
 "        ),\n",
 "        row=position // grid_cols + 1,\n",
 "        col=position % grid_cols + 1,\n",
 "    )\n",
 "\n",
 "fig.update_layout(\n",
 "    title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
 "    legend={\n",
 "        \"orientation\": \"v\",\n",
 "        \"yanchor\": \"top\",\n",
 "        \"y\": 0.2,\n",
 "        \"xanchor\": \"right\",\n",
 "        \"x\": 0.9,\n",
 "    },\n",
 "    margin={\"b\": 10, \"r\": 10},\n",
 "    height=980,\n",
 ")\n",
 "\n",
 "fig.show()\n",
 "\n",
 "del decade_median"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " 
xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", 
" \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# `artists` presumably holds a stringified list like ['A', 'B'] (TODO confirm):\n",
 "# strip the list punctuation and keep the text before the first comma.\n",
 "# Names are deliberately not split on the letter x, which would truncate\n",
 "# any artist whose name contains one.\n",
 "df[\"first_artist\"] = (\n",
 "    df[\"artists\"]\n",
 "    .astype(str)\n",
 "    .str.replace(\"[\", \"\", regex=False)\n",
 "    .str.replace(\"]\", \"\", regex=False)\n",
 "    .str.replace(\"'\", \"\", regex=False)\n",
 "    .str.split(\",\")\n",
 "    .str[0]\n",
 ")\n",
 "df = df.drop(\"artists\", axis=1)"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Per-artist popularity summary, highest mean first.\n",
 "popularity_check = (\n",
 "    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
 "    # A list (not a set) keeps the aggregate columns in a fixed order.\n",
 "    .agg([\"mean\", \"count\", \"median\"])\n",
 "    .sort_values(by=\"mean\", ascending=False)\n",
 "    .reset_index(drop=True)\n",
 ")\n",
 "\n",
 "# NOTE(review): hard-coded row labels of the sorted table above; they depend on\n",
 "# the data and the sort order - confirm which rows they are meant to exclude.\n",
 "popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)\n",
 "\n",
 "# Drop artists with 45 rows or fewer, then keep the 50 with the highest mean.\n",
 "to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n",
 "popularity_check = popularity_check.drop(to_drop)\n",
 "popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Rows of `df` for the artists kept in `popularity_check`, grouped by artist in\n",
 "# that table's order; rows of one artist keep their relative `df` order.\n",
 "top_artist_names = popularity_check[\"first_artist\"].unique()\n",
 "artist_rank = {name: rank for rank, name in enumerate(top_artist_names)}\n",
 "\n",
 "top_50_artists = df[df[\"first_artist\"].isin(top_artist_names)].sort_values(\n",
 "    by=\"first_artist\", key=lambda names: names.map(artist_rank), kind=\"stable\"\n",
 ")\n",
 "\n",
 "top_50_artists = top_50_artists.reset_index(drop=True)\n",
 "top_50_artists.head()"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", 
"%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " 
fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Left-closed bins: a year ending in 0 starts its own decade (1930 -> 1930s).\n",
 "# The 2030 break is needed so that 2020 still falls inside a bin.\n",
 "bins = pd.IntervalIndex.from_breaks(\n",
 "    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],\n",
 "    closed=\"left\",\n",
 ")\n",
 "\n",
 "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n",
 "\n",
 "# Interval text -> label built from the interval's left edge,\n",
 "# e.g. the bin printed as [1930, 1940) becomes 1930s.\n",
 "map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
 "\n",
 "df[\"decade\"] = df[\"decade\"].map(map_decade)\n",
 "\n",
 "df.head()"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", 
\"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " 
\"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "decade_mean = df.groupby(\"decade\")[cols].mean()\n",
 "\n",
 "# One horizontal-bar panel per column of `decade_mean`, placed left to right\n",
 "# and top to bottom on a 3-column grid.\n",
 "grid_cols = 3\n",
 "grid_rows = -(-decade_mean.shape[1] // grid_cols)  # ceiling division\n",
 "\n",
 "fig = make_subplots(\n",
 "    rows=grid_rows,\n",
 "    cols=grid_cols,\n",
 "    subplot_titles=tuple(decade_mean.columns),\n",
 "    vertical_spacing=0.1,\n",
 ")\n",
 "\n",
 "for position, feature in enumerate(decade_mean.columns):\n",
 "    fig.add_trace(\n",
 "        go.Bar(\n",
 "            y=decade_mean.index,\n",
 "            x=decade_mean.iloc[:, position],\n",
 "            name=feature,\n",
 "            orientation=\"h\",\n",
 "        ),\n",
 "        row=position // grid_cols + 1,\n",
 "        col=position % grid_cols + 1,\n",
 "    )\n",
 "\n",
 "fig.update_layout(\n",
 "    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
 "    legend={\n",
 "        \"orientation\": \"v\",\n",
 "        \"yanchor\": \"top\",\n",
 "        \"y\": 0.2,\n",
 "        \"xanchor\": \"right\",\n",
 "        \"x\": 0.9,\n",
 "    },\n",
 "    margin={\"b\": 10, \"r\": 10},\n",
 "    height=980,\n",
 ")\n",
 "\n",
 "fig.show()\n",
 "\n",
 "del decade_mean"
 ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_median = df.groupby(\"decade\")[cols].median()\n", "\n", "fig = 
make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_median.columns[0],\n", " decade_median.columns[1],\n", " decade_median.columns[2],\n", " decade_median.columns[3],\n", " decade_median.columns[4],\n", " decade_median.columns[5],\n", " decade_median.columns[6],\n", " decade_median.columns[7],\n", " decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", " name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " 
x=decade_median.iloc[:, 7],\n", " name=decade_median.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 8],\n", " name=decade_median.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 9],\n", " name=decade_median.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 10],\n", " name=decade_median.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_median" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit 
Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n",
"# Compute the correlation matrix once and reuse it for the mask and the plot.\n",
"corr = df[cols].corr()\n",
"# Hide the upper triangle (diagonal included): it mirrors the lower one.\n",
"mask = np.triu(np.ones_like(corr))\n",
"sns.heatmap(\n",
"    corr,\n",
"    annot=True,\n",
"    fmt=\".2f\",\n",
"    cmap=\"Blues\",\n",
"    mask=mask,\n",
")\n",
"plt.grid(visible=False)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Reduce the stringified artist list (e.g. \"['A', 'B']\") to its first name.\n",
"# Only brackets and quotes are stripped, so names containing any letter\n",
"# (including \"x\") are kept whole.\n",
"df[\"first_artist\"] = (\n",
"    df[\"artists\"]\n",
"    .astype(str)\n",
"    .str.replace(\"[\", \"\", regex=False)\n",
"    .str.replace(\"]\", \"\", regex=False)\n",
"    .str.replace(\"'\", \"\", regex=False)\n",
"    .str.split(\",\")\n",
"    .str[0]\n",
")\n",
"df = df.drop(\"artists\", axis=1)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Per-artist popularity statistics, most popular (by mean) first.\n",
"popularity_check = (\n",
"    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
"    # A list (not a set) keeps the output column order deterministic.\n",
"    .agg([\"mean\", \"count\", \"median\"])\n",
"    .sort_values(by=\"mean\", ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# NOTE(review): these are hard-coded row positions in the ranking above, so they\n",
"# only identify the intended artists for one specific dataset/run - TODO replace\n",
"# with a filter on artist name. Missing positions are skipped instead of raising.\n",
"popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0, errors=\"ignore\")\n",
"\n",
"# Keep artists with more than 45 tracks, then take the 50 with the highest mean.\n",
"popularity_check = (\n",
"    popularity_check[popularity_check[\"count\"] > 45]\n",
"    .sort_values(by=\"mean\", ascending=False)\n",
"    .head(50)\n",
")"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Collect every track by the selected artists, grouped in ranking order.\n",
"# A single concat avoids re-copying the growing frame on every iteration.\n",
"top_artist_names = popularity_check[\"first_artist\"].unique()\n",
"top_50_artists = pd.concat(\n",
"    [df[df[\"first_artist\"] == name] for name in top_artist_names],\n",
"    axis=0,\n",
").reset_index(drop=True)\n",
"top_50_artists.head()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", 
"for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", 
"map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " 
x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Label each track with the calendar decade of its year: 1920-1929 -> \"1920s\",\n",
"# 1930 -> \"1930s\", ..., 2020 -> \"2020s\". Right-closed bins such as (1920, 1930]\n",
"# would file every decade-boundary year under the previous decade.\n",
"# assumes `year` is an integer column with no missing values - TODO confirm\n",
"df[\"decade\"] = (df[\"year\"] // 10 * 10).astype(str) + \"s\"\n",
"\n",
"df.head()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", 
ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Mean of every feature in `cols`, one row per decade.\n",
"decade_mean = df.groupby(\"decade\")[cols].mean()\n",
"\n",
"# One horizontal-bar panel per feature, filled left-to-right, top-to-bottom.\n",
"n_grid_cols = 3\n",
"n_grid_rows = -(-len(decade_mean.columns) // n_grid_cols)  # ceiling division\n",
"\n",
"fig = make_subplots(\n",
"    rows=n_grid_rows,\n",
"    cols=n_grid_cols,\n",
"    subplot_titles=tuple(decade_mean.columns),\n",
"    vertical_spacing=0.1,\n",
")\n",
"\n",
"for position, feature in enumerate(decade_mean.columns):\n",
"    # divmod turns the running position into zero-based (row, column).\n",
"    grid_row, grid_col = divmod(position, n_grid_cols)\n",
"    fig.add_trace(\n",
"        go.Bar(\n",
"            y=decade_mean.index,\n",
"            x=decade_mean[feature],\n",
"            name=feature,\n",
"            orientation=\"h\",\n",
"        ),\n",
"        row=grid_row + 1,\n",
"        col=grid_col + 1,\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
"    legend={\n",
"        \"orientation\": \"v\",\n",
"        \"yanchor\": \"top\",\n",
"        \"y\": 0.2,\n",
"        \"xanchor\": \"right\",\n",
"        \"x\": 0.9,\n",
"    },\n",
"    margin={\"b\": 10, \"r\": 10},\n",
"    height=980,\n",
")\n",
"\n",
"fig.show()\n",
"\n",
"del decade_mean"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Median of every feature in `cols`, one row per decade.\n",
"decade_median = df.groupby(\"decade\")[cols].median()\n",
"\n",
"# One horizontal-bar panel per feature, filled left-to-right, top-to-bottom.\n",
"n_grid_cols = 3\n",
"n_grid_rows = -(-len(decade_median.columns) // n_grid_cols)  # ceiling division\n",
"\n",
"fig = make_subplots(\n",
"    rows=n_grid_rows,\n",
"    cols=n_grid_cols,\n",
"    subplot_titles=tuple(decade_median.columns),\n",
"    vertical_spacing=0.1,\n",
")\n",
"\n",
"for position, feature in enumerate(decade_median.columns):\n",
"    # divmod turns the running position into zero-based (row, column).\n",
"    grid_row, grid_col = divmod(position, n_grid_cols)\n",
"    fig.add_trace(\n",
"        go.Bar(\n",
"            y=decade_median.index,\n",
"            x=decade_median[feature],\n",
"            name=feature,\n",
"            orientation=\"h\",\n",
"        ),\n",
"        row=grid_row + 1,\n",
"        col=grid_col + 1,\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
"    legend={\n",
"        \"orientation\": \"v\",\n",
"        \"yanchor\": \"top\",\n",
"        \"y\": 0.2,\n",
"        \"xanchor\": \"right\",\n",
"        \"x\": 0.9,\n",
"    },\n",
"    margin={\"b\": 10, \"r\": 10},\n",
"    height=980,\n",
")\n",
"\n",
"fig.show()\n",
"\n",
"del decade_median"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# (column, barmode) for each categorical breakdown, in display order.\n",
"category_plots = [(\"explicit\", \"group\"), (\"mode\", \"group\"), (\"key\", \"stack\")]\n",
"\n",
"for category, bar_mode in category_plots:\n",
"    # Number of tracks per (decade, category value).\n",
"    category_count = (\n",
"        df.groupby([\"decade\", category], as_index=False)[\"year\"]\n",
"        .count()\n",
"        .rename(columns={\"year\": \"count\"})\n",
"    )\n",
"\n",
"    fig = px.histogram(\n",
"        data_frame=category_count,\n",
"        x=\"decade\",\n",
"        y=\"count\",\n",
"        color=category,\n",
"        histfunc=\"sum\",\n",
"        color_discrete_sequence=px.colors.qualitative.Pastel,\n",
"        text_auto=True,\n",
"    )\n",
"    fig.update_layout(\n",
"        title={\"text\": f\"{category.capitalize()} Count per Decade\", \"font\": {\"size\": 24}},\n",
"        bargap=0.2,\n",
"        bargroupgap=0.1,\n",
"        barmode=bar_mode,\n",
"        xaxis_title=\"Decade\",\n",
"        yaxis_title=\"Count\",\n",
"    )\n",
"    fig.show()\n",
"\n",
"del category_count"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " 
\"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"first_artist\"] = (\n", " df[\"artists\"]\n", " .apply(lambda x: str(x).split(\"x\")[0])\n", " .apply(lambda x: str(x).replace(\"[\", \"\"))\n", " .apply(lambda x: str(x).replace(\"]\", \"\"))\n", " .apply(lambda x: str(x).replace(\"'\", \"\"))\n", ")\n", "df = df.drop(\"artists\", axis=1)\n", "df[\"first_artist\"] = df[\"first_artist\"].apply(lambda x: str(x).split(\",\")[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "popularity_check = (\n", " df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n", " .agg({\"mean\", \"count\", \"median\"})\n", " .sort_values(by=\"mean\", ascending=False)\n", " .reset_index(drop=True)\n", ")\n", "\n", "popularity_check = popularity_check.drop(16091, axis=0)\n", "popularity_check = popularity_check.drop(16086, axis=0)\n", "popularity_check = popularity_check.drop(16429, axis=0)\n", "\n", "to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n", "popularity_check = popularity_check.drop(to_drop)\n", "popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists = pd.DataFrame()\n", "\n", "for i in popularity_check[\"first_artist\"].unique():\n", " artist = df.query(\"first_artist == @i\")\n", " 
top_50_artists = pd.concat([top_50_artists, artist], axis=0)\n", "\n", "top_50_artists = top_50_artists.reset_index(drop=True)\n", "top_50_artists.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bins = pd.IntervalIndex.from_breaks(\n", " [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]\n", ")\n", "\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n", "\n", "map_decade = {\n", " \"(1920, 1930]\": \"1920s\",\n", " \"(1930, 1940]\": \"1930s\",\n", " \"(1940, 1950]\": \"1940s\",\n", " \"(1950, 1960]\": \"1950s\",\n", " \"(1960, 1970]\": \"1960s\",\n", " \"(1970, 1980]\": \"1970s\",\n", " \"(1980, 1990]\": \"1980s\",\n", " 
\"(1990, 2000]\": \"1990s\",\n", " \"(2000, 2010]\": \"2000s\",\n", " \"(2010, 2020]\": \"2010s\",\n", "}\n", "\n", "df[\"decade\"] = df[\"decade\"].map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " 
coloraxis_colorbar={\n",
    "        \"title\": \"Count\",\n",
    "        \"thicknessmode\": \"pixels\",\n",
    "        \"thickness\": 20,\n",
    "        \"lenmode\": \"pixels\",\n",
    "        \"len\": 500,\n",
    "        \"yanchor\": \"top\",\n",
    "        \"xanchor\": \"right\",\n",
    "        \"y\": 1.15,\n",
    "        \"x\": 0.95,\n",
    "        \"orientation\": \"h\",\n",
    "    },\n",
    "    margin={\"r\": 10},\n",
    ")\n",
    "\n",
    "fig.update_xaxes(title=\"Count\")\n",
    "fig.update_yaxes(title=\"Decades\")\n",
    "fig.show()\n",
    "\n",
    "del decade_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    "    \"valence\",\n",
    "    \"acousticness\",\n",
    "    \"danceability\",\n",
    "    \"duration_min\",\n",
    "    \"energy\",\n",
    "    \"instrumentalness\",\n",
    "    \"liveness\",\n",
    "    \"loudness\",\n",
    "    \"popularity\",\n",
    "    \"speechiness\",\n",
    "    \"tempo\",\n",
    "]\n",
    "for col in cols:\n",
    "    fig = px.histogram(\n",
    "        data_frame=df,\n",
    "        x=col,\n",
    "        color=\"decade\",\n",
    "        color_discrete_sequence=px.colors.qualitative.Light24,\n",
    "    )\n",
    "    fig.update_traces(opacity=0.7)\n",
    "    fig.update_layout(\n",
    "        bargap=0.1,\n",
    "        bargroupgap=0.05,\n",
    "        barmode=\"stack\",\n",
    "        title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n",
    "        legend={\"title\": \"Decade\"},\n",
    "    )\n",
    "    fig.update_yaxes(title=\"Count\")\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_mean = df.groupby(\"decade\")[cols].mean()\n",
    "\n",
    "# One horizontal bar chart per feature on a 3-column grid, filled row by row\n",
    "# (e.g. 11 features -> 4 rows, the last row only partly used).\n",
    "grid_cols = 3\n",
    "grid_rows = (len(decade_mean.columns) + grid_cols - 1) // grid_cols\n",
    "\n",
    "fig = make_subplots(\n",
    "    rows=grid_rows,\n",
    "    cols=grid_cols,\n",
    "    subplot_titles=tuple(decade_mean.columns),\n",
    "    vertical_spacing=0.1,\n",
    ")\n",
    "\n",
    "for position, feature in enumerate(decade_mean.columns):\n",
    "    fig.add_trace(\n",
    "        go.Bar(\n",
    "            y=decade_mean.index,\n",
    "            x=decade_mean.iloc[:, position],\n",
    "            name=feature,\n",
    "            orientation=\"h\",\n",
    "        ),\n",
    "        row=position // grid_cols + 1,\n",
    "        col=position % grid_cols + 1,\n",
    "    )\n",
    "\n",
    "fig.update_layout(\n",
    "    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
    "    legend={\n",
    "        \"orientation\": \"v\",\n",
    "        \"yanchor\": \"top\",\n",
    "        \"y\": 0.2,\n",
    "        \"xanchor\": \"right\",\n",
    "        \"x\": 0.9,\n",
    "    },\n",
    "    margin={\"b\": 10, \"r\": 10},\n",
    "    height=980,\n",
    ")\n",
    "\n",
    "fig.show()\n",
    "\n",
    "del decade_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_median = df.groupby(\"decade\")[cols].median()\n",
    "\n",
    "# One horizontal bar chart per feature on a 3-column grid, filled row by row\n",
    "# (e.g. 11 features -> 4 rows, the last row only partly used).\n",
    "grid_cols = 3\n",
    "grid_rows = (len(decade_median.columns) + grid_cols - 1) // grid_cols\n",
    "\n",
    "fig = make_subplots(\n",
    "    rows=grid_rows,\n",
    "    cols=grid_cols,\n",
    "    subplot_titles=tuple(decade_median.columns),\n",
    "    vertical_spacing=0.1,\n",
    ")\n",
    "\n",
    "for position, feature in enumerate(decade_median.columns):\n",
    "    fig.add_trace(\n",
    "        go.Bar(\n",
    "            y=decade_median.index,\n",
    "            x=decade_median.iloc[:, position],\n",
    "            name=feature,\n",
    "            orientation=\"h\",\n",
    "        ),\n",
    "        row=position // grid_cols + 1,\n",
    "        col=position % grid_cols + 1,\n",
    "    )\n",
    "\n",
    "fig.update_layout(\n",
    "    title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
    "    legend={\n",
    "        \"orientation\": \"v\",\n",
    "        \"yanchor\": \"top\",\n",
    "        \"y\": 0.2,\n",
    "        \"xanchor\": \"right\",\n",
    "        \"x\": 0.9,\n",
    "    },\n",
    "    margin={\"b\": 10, \"r\": 10},\n",
    "    height=980,\n",
    ")\n",
    "\n",
    "fig.show()\n",
    "\n",
    "del decade_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "explicit_count = (\n",
    "    df.groupby([\"decade\", \"explicit\"], 
as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " 
cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"first_artist\"] = (\n", " df[\"artists\"]\n", " .apply(lambda x: str(x).split(\"x\")[0])\n", " .apply(lambda x: str(x).replace(\"[\", \"\"))\n", " .apply(lambda x: str(x).replace(\"]\", \"\"))\n", " .apply(lambda x: str(x).replace(\"'\", \"\"))\n", ")\n", "df = df.drop(\"artists\", axis=1)\n", "df[\"first_artist\"] = df[\"first_artist\"].apply(lambda x: str(x).split(\",\")[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "popularity_check = (\n", " df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n", " .agg({\"mean\", \"count\", \"median\"})\n", " .sort_values(by=\"mean\", ascending=False)\n", " .reset_index(drop=True)\n", ")\n", "\n", "popularity_check = popularity_check.drop(16091, axis=0)\n", "popularity_check = popularity_check.drop(16086, axis=0)\n", "popularity_check = popularity_check.drop(16429, axis=0)\n", "\n", "to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n", "popularity_check = popularity_check.drop(to_drop)\n", "popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists = pd.DataFrame()\n", "\n", "for i in popularity_check[\"first_artist\"].unique():\n", " artist = df.query(\"first_artist == @i\")\n", " top_50_artists = pd.concat([top_50_artists, artist], axis=0)\n", "\n", "top_50_artists = top_50_artists.reset_index(drop=True)\n", "top_50_artists.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " 
.value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " 
autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def 
fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
{}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", 
"metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = 
make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " 
\"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bins = pd.IntervalIndex.from_breaks(\n", " [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]\n", ")\n", "\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n", "\n", "map_decade = {\n", " \"(1920, 1930]\": \"1920s\",\n", " \"(1930, 1940]\": \"1930s\",\n", " \"(1940, 1950]\": \"1940s\",\n", " \"(1950, 1960]\": \"1950s\",\n", " \"(1960, 1970]\": \"1960s\",\n", " \"(1970, 1980]\": \"1970s\",\n", " \"(1980, 1990]\": \"1980s\",\n", " \"(1990, 2000]\": \"1990s\",\n", " \"(2000, 2010]\": \"2000s\",\n", " \"(2010, 2020]\": \"2010s\",\n", "}\n", "\n", "df[\"decade\"] = df[\"decade\"].map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 
10},\n",
    ")\n",
    "\n",
    "fig.update_xaxes(title=\"Count\")\n",
    "fig.update_yaxes(title=\"Decades\")\n",
    "fig.show()\n",
    "\n",
    "del decade_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.query(\"duration_min <= 5\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Header 1.2\n",
    "\n",
    "## Header 2.2\n",
    "\n",
    "### Header 3.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_counts = (\n",
    "    df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n",
    ")\n",
    "\n",
    "fig = px.bar(\n",
    "    data_frame=decade_counts,\n",
    "    y=\"decade\",\n",
    "    x=\"count\",\n",
    "    color=\"count\",\n",
    "    color_continuous_scale=px.colors.sequential.Sunset,\n",
    "    text_auto=True,\n",
    ")\n",
    "\n",
    "fig.update_layout(\n",
    "    title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n",
    "    coloraxis_colorbar={\n",
    "        \"title\": \"Count\",\n",
    "        \"thicknessmode\": \"pixels\",\n",
    "        \"thickness\": 20,\n",
    "        \"lenmode\": \"pixels\",\n",
    "        \"len\": 500,\n",
    "        \"yanchor\": \"top\",\n",
    "        \"xanchor\": \"right\",\n",
    "        \"y\": 1.15,\n",
    "        \"x\": 0.95,\n",
    "        \"orientation\": \"h\",\n",
    "    },\n",
    "    margin={\"r\": 10},\n",
    ")\n",
    "\n",
    "fig.update_xaxes(title=\"Count\")\n",
    "fig.update_yaxes(title=\"Decades\")\n",
    "fig.show()\n",
    "\n",
    "del decade_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols = [\n",
    "    \"valence\",\n",
    "    \"acousticness\",\n",
    "    \"danceability\",\n",
    "    \"duration_min\",\n",
    "    \"energy\",\n",
    "    \"instrumentalness\",\n",
    "    \"liveness\",\n",
    "    \"loudness\",\n",
    "    \"popularity\",\n",
    "    \"speechiness\",\n",
    "    \"tempo\",\n",
    "]\n",
    "for col in cols:\n",
    "    fig = px.histogram(\n",
    "        data_frame=df,\n",
    "        x=col,\n",
    "        color=\"decade\",\n",
    "        color_discrete_sequence=px.colors.qualitative.Light24,\n",
    "    )\n",
    "    fig.update_traces(opacity=0.7)\n",
    "    fig.update_layout(\n",
    "        bargap=0.1,\n",
    "        bargroupgap=0.05,\n",
    "        barmode=\"stack\",\n",
    "        title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n",
    "        legend={\"title\": \"Decade\"},\n",
    "    )\n",
    "    fig.update_yaxes(title=\"Count\")\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_mean = df.groupby(\"decade\")[cols].mean()\n",
    "\n",
    "# One horizontal bar chart per feature on a 3-column grid, filled row by row\n",
    "# (e.g. 11 features -> 4 rows, the last row only partly used).\n",
    "grid_cols = 3\n",
    "grid_rows = (len(decade_mean.columns) + grid_cols - 1) // grid_cols\n",
    "\n",
    "fig = make_subplots(\n",
    "    rows=grid_rows,\n",
    "    cols=grid_cols,\n",
    "    subplot_titles=tuple(decade_mean.columns),\n",
    "    vertical_spacing=0.1,\n",
    ")\n",
    "\n",
    "for position, feature in enumerate(decade_mean.columns):\n",
    "    fig.add_trace(\n",
    "        go.Bar(\n",
    "            y=decade_mean.index,\n",
    "            x=decade_mean.iloc[:, position],\n",
    "            name=feature,\n",
    "            orientation=\"h\",\n",
    "        ),\n",
    "        row=position // grid_cols + 1,\n",
    "        col=position % grid_cols + 1,\n",
    "    )\n",
    "\n",
    "fig.update_layout(\n",
    "    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
    "    legend={\n",
    "        \"orientation\": \"v\",\n",
    "        \"yanchor\": \"top\",\n",
    "        \"y\": 0.2,\n",
    "        \"xanchor\": \"right\",\n",
    "        \"x\": 0.9,\n",
    "    },\n",
    "    margin={\"b\": 10, \"r\": 10},\n",
    "    height=980,\n",
    ")\n",
    "\n",
    "fig.show()\n",
    "\n",
    "del decade_mean"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "decade_median = df.groupby(\"decade\")[cols].median()\n",
    "\n",
    "fig = make_subplots(\n",
    "    rows=4,\n",
    "    cols=3,\n",
    "    subplot_titles=(\n",
    "        decade_median.columns[0],\n",
    "        decade_median.columns[1],\n",
    "        decade_median.columns[2],\n",
    "        decade_median.columns[3],\n",
    "        decade_median.columns[4],\n",
    "        decade_median.columns[5],\n",
    "        decade_median.columns[6],\n",
    "        decade_median.columns[7],\n",
    "        
decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", " name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 7],\n", " name=decade_median.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 8],\n", " name=decade_median.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", 
")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 9],\n", " name=decade_median.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 10],\n", " name=decade_median.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_median" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " 
histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", 
" \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"first_artist\"] = (\n", " df[\"artists\"]\n", " .apply(lambda x: str(x).split(\"x\")[0])\n", " .apply(lambda x: str(x).replace(\"[\", \"\"))\n", " .apply(lambda x: str(x).replace(\"]\", \"\"))\n", " .apply(lambda x: str(x).replace(\"'\", \"\"))\n", ")\n", "df = df.drop(\"artists\", axis=1)\n", "df[\"first_artist\"] = df[\"first_artist\"].apply(lambda x: str(x).split(\",\")[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "popularity_check = (\n", " df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n", " .agg({\"mean\", \"count\", \"median\"})\n", " .sort_values(by=\"mean\", ascending=False)\n", " .reset_index(drop=True)\n", ")\n", "\n", "popularity_check = popularity_check.drop(16091, axis=0)\n", "popularity_check = popularity_check.drop(16086, axis=0)\n", "popularity_check = 
popularity_check.drop(16429, axis=0)\n", "\n", "to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n", "popularity_check = popularity_check.drop(to_drop)\n", "popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Collect every track by the artists kept in popularity_check, grouped in\n", "# ranking order. The per-artist frames are concatenated once instead of\n", "# growing a DataFrame inside the loop, which re-copies all rows on each pass.\n", "ranked_artists = popularity_check[\"first_artist\"].unique()\n", "artist_frames = [df[df[\"first_artist\"] == name] for name in ranked_artists]\n", "\n", "# pd.concat raises on an empty list, so fall back to an empty frame.\n", "top_50_artists = (\n", "    pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()\n", ").reset_index(drop=True)\n", "top_50_artists.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { 
"cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " 
row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Bin release years into calendar decades. Bins are closed on the left so a\n", "# year such as 1930 is labelled \"1930s\"; right-closed bins put it in the\n", "# \"1920s\" and left 1920 unbinned. The breaks run to 2030 so 2020 gets a label.\n", "decade_starts = list(range(1920, 2030, 10))\n", "bins = pd.IntervalIndex.from_breaks(decade_starts + [2030], closed=\"left\")\n", "\n", "# Left-closed integer intervals render as \"[1920, 1930)\"; map them to short labels.\n", "map_decade = {f\"[{start}, {start + 10})\": f\"{start}s\" for start in decade_starts}\n", "\n", "# Interval strings without an entry in map_decade come out as NaN.\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\").map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 
24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_mean = df.groupby(\"decade\")[cols].mean()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_mean.columns[0],\n", " decade_mean.columns[1],\n", " decade_mean.columns[2],\n", " decade_mean.columns[3],\n", " decade_mean.columns[4],\n", " decade_mean.columns[5],\n", " decade_mean.columns[6],\n", " decade_mean.columns[7],\n", " decade_mean.columns[8],\n", " decade_mean.columns[9],\n", " decade_mean.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " 
x=decade_mean.iloc[:, 0],\n", " name=decade_mean.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 1],\n", " name=decade_mean.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 2],\n", " name=decade_mean.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 3],\n", " name=decade_mean.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 4],\n", " name=decade_mean.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 5],\n", " name=decade_mean.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 6],\n", " name=decade_mean.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 7],\n", " name=decade_mean.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 8],\n", " name=decade_mean.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 9],\n", " name=decade_mean.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " 
y=decade_mean.index,\n", " x=decade_mean.iloc[:, 10],\n", " name=decade_mean.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_mean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_median = df.groupby(\"decade\")[cols].median()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_median.columns[0],\n", " decade_median.columns[1],\n", " decade_median.columns[2],\n", " decade_median.columns[3],\n", " decade_median.columns[4],\n", " decade_median.columns[5],\n", " decade_median.columns[6],\n", " decade_median.columns[7],\n", " decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", " name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", 
"\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 7],\n", " name=decade_median.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 8],\n", " name=decade_median.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 9],\n", " name=decade_median.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 10],\n", " name=decade_median.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_median" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], 
as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], 
"source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# One scatter plot per feature against popularity, each with an OLS trend line.\n",
 "for col in cols:\n",
 "    fig = px.scatter(\n",
 "        data_frame=df,\n",
 "        x=col,\n",
 "        y=\"popularity\",\n",
 "        trendline=\"ols\",\n",
 "        trendline_color_override=\"#A6E3A1\",\n",
 "    )\n",
 "    fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n",
 "    fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n",
 "    fig.show()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# Regression of every feature on popularity, drawn on a 3x4 grid of axes.\n",
 "plt.figure(figsize=(15, 10))\n",
 "\n",
 "for i, col in enumerate(cols):\n",
 "    ax = plt.subplot(3, 4, i + 1)\n",
 "    sns.regplot(\n",
 "        data=df,\n",
 "        x=\"popularity\",\n",
 "        y=col,\n",
 "        ax=ax,\n",
 "        scatter_kws={\"alpha\": 0.5},\n",
 "        line_kws={\"color\": \"#F38BA8\"},\n",
 "    )\n",
 "\n",
 "plt.tight_layout()\n",
 "plt.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "    \"popularity\",\n",
 "]\n",
 "\n",
 "# Compute the correlation matrix once and reuse it for the mask and the plot.\n",
 "corr = df[cols].corr()\n",
 "\n",
 "plt.figure(figsize=(10, 7))\n",
 "mask = np.triu(np.ones_like(corr))  # hides the upper triangle, diagonal included\n",
 "sns.heatmap(\n",
 "    corr,\n",
 "    annot=True,\n",
 "    fmt=\".2f\",\n",
 "    cmap=\"Blues\",\n",
 "    mask=mask,\n",
 ")\n",
 "plt.grid(visible=False)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Reduce the artists text to its first comma-separated entry, without brackets\n",
 "# or single quotes.\n",
 "# NOTE(review): the first split is on the letter \"x\", which truncates every name\n",
 "# containing an \"x\" - confirm the intended separator before trusting first_artist.\n",
 "df[\"first_artist\"] = (\n",
 "    df[\"artists\"]\n",
 "    .astype(str)\n",
 "    .str.split(\"x\")\n",
 "    .str[0]\n",
 "    .str.replace(\"[\", \"\", regex=False)\n",
 "    .str.replace(\"]\", \"\", regex=False)\n",
 "    .str.replace(\"'\", \"\", regex=False)\n",
 "    .str.split(\",\")\n",
 "    .str[0]\n",
 ")\n",
 "df = df.drop(\"artists\", axis=1)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Popularity statistics per first_artist, best mean first.\n",
 "# A list (not a set) keeps the order of the aggregate columns deterministic.\n",
 "popularity_check = (\n",
 "    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
 "    .agg([\"mean\", \"count\", \"median\"])\n",
 "    .sort_values(by=\"mean\", ascending=False)\n",
 "    .reset_index(drop=True)\n",
 ")\n",
 "\n",
 "# NOTE(review): hard-coded row labels taken from one particular run; they point\n",
 "# at different artists as soon as the data or the sort order changes. Prefer\n",
 "# filtering by artist name. A missing label still raises KeyError, as before.\n",
 "popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)\n",
 "\n",
 "# Keep artists with more than 45 rows, then the 50 with the highest mean popularity.\n",
 "popularity_check = (\n",
 "    popularity_check[popularity_check[\"count\"] > 45]\n",
 "    .sort_values(by=\"mean\", ascending=False)\n",
 "    .head(50)\n",
 ")"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# All rows of the selected artists, grouped by artist in ranking order.\n",
 "# One filter plus a stable sort replaces growing a frame with concat in a loop.\n",
 "artist_rank = {\n",
 "    name: pos for pos, name in enumerate(popularity_check[\"first_artist\"].unique())\n",
 "}\n",
 "top_50_artists = (\n",
 "    df[df[\"first_artist\"].isin(list(artist_rank))]\n",
 "    .assign(_rank=lambda frame: frame[\"first_artist\"].map(artist_rank))\n",
 "    .sort_values(\"_rank\", kind=\"stable\")\n",
 "    .drop(columns=\"_rank\")\n",
 "    .reset_index(drop=True)\n",
 ")\n",
 "top_50_artists.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " 
.value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Number of rows per decade among the selected artists, smallest bar first.\n",
 "(\n",
 "    top_50_artists[\"decade\"]\n",
 "    .value_counts()\n",
 "    .sort_values(ascending=True)\n",
 "    .plot(kind=\"barh\")\n",
 ")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# Per-artist density of every feature on a 3x4 grid; the legend is switched off.\n",
 "plt.figure(figsize=(15, 10))\n",
 "\n",
 "for i, col in enumerate(cols):\n",
 "    ax = plt.subplot(3, 4, i + 1)\n",
 "    # ax is the current axes, so passing it only makes the target explicit.\n",
 "    sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False, ax=ax)\n",
 "\n",
 "plt.tight_layout()\n",
 "plt.show()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "df = pd.read_csv(\"data.csv\", parse_dates=[\"release_date\"])\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Integer codes -> display labels for the key and mode columns.\n",
 "map_key = dict(\n",
 "    enumerate([\"C\", \"C#\", \"D\", \"D#\", \"E\", \"F\", \"F#\", \"G\", \"G#\", \"A\", \"A#\", \"B\"])\n",
 ")\n",
 "map_mode = {0: \"Minor\", 1: \"Major\"}\n",
 "\n",
 "df[\"mode\"] = df[\"mode\"].map(map_mode)\n",
 "df[\"key\"] = df[\"key\"].map(map_key)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Index labels of rows repeating an earlier (artists, name) pair.\n",
 "check_dups = df[[\"artists\", \"name\"]]\n",
 "dups = check_dups.loc[check_dups.duplicated()].index"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "print(\"Before dropping duplicated:\", df.shape)\n",
 "df = df.drop(index=dups)\n",
 "print(\"After dropping duplicated:\", df.shape)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Divide by 60_000 (milliseconds per minute) and rename the column accordingly.\n",
 "df = df.assign(duration_ms=df[\"duration_ms\"] / 60_000).rename(\n",
 "    columns={\"duration_ms\": \"duration_min\"}\n",
 ")\n",
 "df.head()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "df = df.drop(columns=[\"release_date\", \"id\"])\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def 
fun_subplots_plotly(df, col):\n",
 "    \"\"\"Show a histogram and a box plot of ``df[col]`` side by side.\n",
 "\n",
 "    The box plot panel also gets dotted horizontal lines and arrow\n",
 "    annotations at the mean and at the median of the column.\n",
 "    \"\"\"\n",
 "    mean_val = df[col].mean()\n",
 "    median_val = df[col].median()\n",
 "    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n",
 "\n",
 "    fig.add_trace(\n",
 "        go.Histogram(\n",
 "            x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n",
 "        ),\n",
 "        row=1,\n",
 "        col=1,\n",
 "    )\n",
 "\n",
 "    # Dotted reference lines in the box plot panel: mean first, then median.\n",
 "    for level, line_color in ((mean_val, \"#A6E3A1\"), (median_val, \"#CBA6F7\")):\n",
 "        fig.add_shape(\n",
 "            type=\"line\",\n",
 "            xref=\"paper\",\n",
 "            yref=\"y\",\n",
 "            x0=-1,\n",
 "            x1=1,\n",
 "            y0=level,\n",
 "            y1=level,\n",
 "            row=1,\n",
 "            col=2,\n",
 "            line={\"color\": line_color, \"width\": 3, \"dash\": \"dot\"},\n",
 "        )\n",
 "\n",
 "    # Arrow labels for the two lines, at x = -0.7 (mean) and x = 0.7 (median).\n",
 "    labels = (\n",
 "        (-0.7, mean_val, f\"Mean = {mean_val:.2f}\"),\n",
 "        (0.7, median_val, f\"Median = {median_val:.2f}\"),\n",
 "    )\n",
 "    for x_pos, level, label in labels:\n",
 "        fig.add_annotation(\n",
 "            xref=\"paper\",\n",
 "            x=x_pos,\n",
 "            y=level,\n",
 "            showarrow=True,\n",
 "            arrowhead=2,\n",
 "            text=label,\n",
 "            row=1,\n",
 "            col=2,\n",
 "        )\n",
 "\n",
 "    fig.add_trace(\n",
 "        go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n",
 "        row=1,\n",
 "        col=2,\n",
 "    )\n",
 "\n",
 "    # Horizontal legend anchored above the top-right corner of the plot area.\n",
 "    legend_style = {\n",
 "        \"orientation\": \"h\",\n",
 "        \"yanchor\": \"bottom\",\n",
 "        \"y\": 1.02,\n",
 "        \"xanchor\": \"right\",\n",
 "        \"x\": 1,\n",
 "    }\n",
 "    fig.update_layout(\n",
 "        title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n",
 "        legend=legend_style,\n",
 "    )\n",
 "    fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n",
 "    fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n",
 "    fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n",
 "    fig.update_xaxes(title_text=\"\", row=1, col=2)\n",
 "    fig.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": 
{}, "outputs": [], "source": [
 "music_vals = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"popularity\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# One histogram + box plot figure per column.\n",
 "for col in music_vals:\n",
 "    fun_subplots_plotly(df, col)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Left-closed bins, so a decade starts on its round year: 1930 falls in the\n",
 "# 1930s. Right-closed bins put it in the 1920s and left 1920 without a label.\n",
 "bins = pd.IntervalIndex.from_breaks(list(range(1920, 2031, 10)), closed=\"left\")\n",
 "\n",
 "# Interval text -> decade label, e.g. \"[1920, 1930)\" -> \"1920s\".\n",
 "map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
 "\n",
 "# Years outside the bins become the text \"nan\", which the mapping turns into NaN.\n",
 "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\").map(map_decade)\n",
 "\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def plot_decade_counts(frame, title):\n",
 "    \"\"\"Draw a horizontal bar chart of the number of rows of ``frame`` per decade.\n",
 "\n",
 "    ``frame`` needs a ``decade`` column; ``title`` is used as the figure title.\n",
 "    \"\"\"\n",
 "    decade_counts = (\n",
 "        frame[\"decade\"]\n",
 "        .value_counts()\n",
 "        .reset_index()\n",
 "        .sort_values(by=\"decade\", ascending=True)\n",
 "    )\n",
 "\n",
 "    fig = px.bar(\n",
 "        data_frame=decade_counts,\n",
 "        y=\"decade\",\n",
 "        x=\"count\",\n",
 "        color=\"count\",\n",
 "        color_continuous_scale=px.colors.sequential.Sunset,\n",
 "        text_auto=True,\n",
 "    )\n",
 "\n",
 "    fig.update_layout(\n",
 "        title={\"text\": title, \"font\": {\"size\": 24}},\n",
 "        coloraxis_colorbar={\n",
 "            \"title\": \"Count\",\n",
 "            \"thicknessmode\": \"pixels\",\n",
 "            \"thickness\": 20,\n",
 "            \"lenmode\": \"pixels\",\n",
 "            \"len\": 500,\n",
 "            \"yanchor\": \"top\",\n",
 "            \"xanchor\": \"right\",\n",
 "            \"y\": 1.15,\n",
 "            \"x\": 0.95,\n",
 "            \"orientation\": \"h\",\n",
 "        },\n",
 "        margin={\"r\": 10},\n",
 "    )\n",
 "\n",
 "    fig.update_xaxes(title=\"Count\")\n",
 "    fig.update_yaxes(title=\"Decades\")\n",
 "    fig.show()\n",
 "\n",
 "\n",
 "plot_decade_counts(df, \"Count of songs per decade\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Same chart after the duration filter; the title states the filter as applied (<=).\n",
 "plot_decade_counts(df, \"Count of songs per decade (<= 5 min)\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " 
color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def plot_decade_stat(stat_by_decade, title, n_grid_cols=3):\n",
 "    \"\"\"Draw one horizontal bar chart per column of ``stat_by_decade``.\n",
 "\n",
 "    ``stat_by_decade`` is indexed by decade with one column per feature. The\n",
 "    charts fill a grid of ``n_grid_cols`` columns row by row, each titled with\n",
 "    its column name; ``title`` is the title of the whole figure.\n",
 "    \"\"\"\n",
 "    features = list(stat_by_decade.columns)\n",
 "    n_grid_rows = max(1, -(-len(features) // n_grid_cols))  # ceiling division\n",
 "\n",
 "    fig = make_subplots(\n",
 "        rows=n_grid_rows,\n",
 "        cols=n_grid_cols,\n",
 "        subplot_titles=features,\n",
 "        vertical_spacing=0.1,\n",
 "    )\n",
 "\n",
 "    for pos, feature in enumerate(features):\n",
 "        fig.add_trace(\n",
 "            go.Bar(\n",
 "                y=stat_by_decade.index,\n",
 "                x=stat_by_decade.iloc[:, pos],\n",
 "                name=feature,\n",
 "                orientation=\"h\",\n",
 "            ),\n",
 "            row=pos // n_grid_cols + 1,\n",
 "            col=pos % n_grid_cols + 1,\n",
 "        )\n",
 "\n",
 "    fig.update_layout(\n",
 "        title={\"text\": title, \"font\": {\"size\": 22}},\n",
 "        legend={\n",
 "            \"orientation\": \"v\",\n",
 "            \"yanchor\": \"top\",\n",
 "            \"y\": 0.2,\n",
 "            \"xanchor\": \"right\",\n",
 "            \"x\": 0.9,\n",
 "        },\n",
 "        margin={\"b\": 10, \"r\": 10},\n",
 "        height=980,\n",
 "    )\n",
 "\n",
 "    fig.show()\n",
 "\n",
 "\n",
 "decade_mean = df.groupby(\"decade\")[cols].mean()\n",
 "plot_decade_stat(decade_mean, \"Mean Musical Values per Decade\")\n",
 "\n",
 "del decade_mean"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "decade_median = df.groupby(\"decade\")[cols].median()\n",
 "plot_decade_stat(decade_median, \"Median Musical Values per Decade\")\n",
 "\n",
 "del decade_median"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# (column to break down by, figure title, bar mode), in display order.\n",
 "breakdowns = [\n",
 "    (\"explicit\", \"Explicit Count per Decade\", \"group\"),\n",
 "    (\"mode\", \"Mode Count per Decade\", \"group\"),\n",
 "    (\"key\", \"Key Count per Decade\", \"stack\"),\n",
 "]\n",
 "\n",
 "for category, title, barmode in breakdowns:\n",
 "    # Rows per (decade, category) pair, counted through the year column.\n",
 "    category_count = (\n",
 "        df.groupby([\"decade\", category], as_index=False)[\"year\"]\n",
 "        .count()\n",
 "        .rename(columns={\"year\": \"count\"})\n",
 "    )\n",
 "\n",
 "    fig = px.histogram(\n",
 "        data_frame=category_count,\n",
 "        x=\"decade\",\n",
 "        y=\"count\",\n",
 "        color=category,\n",
 "        histfunc=\"sum\",\n",
 "        color_discrete_sequence=px.colors.qualitative.Pastel,\n",
 "        text_auto=True,\n",
 "    )\n",
 "    fig.update_layout(\n",
 "        title={\"text\": title, \"font\": {\"size\": 24}},\n",
 "        bargap=0.2,\n",
 "        bargroupgap=0.1,\n",
 "        barmode=barmode,\n",
 "        xaxis_title=\"Decade\",\n",
 "        yaxis_title=\"Count\",\n",
 "    )\n",
 "    fig.show()\n",
 "\n",
 "del category_count"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# One scatter plot per feature against popularity, each with an OLS trend line.\n",
 "for col in cols:\n",
 "    fig = px.scatter(\n",
 "        data_frame=df,\n",
 "        x=col,\n",
 "        y=\"popularity\",\n",
 "        trendline=\"ols\",\n",
 "        trendline_color_override=\"#A6E3A1\",\n",
 "    )\n",
 "    fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n",
 "    fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n",
 "    fig.show()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " 
\"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n",
 "# Regression of every feature on popularity, drawn on a 3x4 grid of axes.\n",
 "plt.figure(figsize=(15, 10))\n",
 "\n",
 "for i, col in enumerate(cols):\n",
 "    ax = plt.subplot(3, 4, i + 1)\n",
 "    sns.regplot(\n",
 "        data=df,\n",
 "        x=\"popularity\",\n",
 "        y=col,\n",
 "        ax=ax,\n",
 "        scatter_kws={\"alpha\": 0.5},\n",
 "        line_kws={\"color\": \"#F38BA8\"},\n",
 "    )\n",
 "\n",
 "plt.tight_layout()\n",
 "plt.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "    \"popularity\",\n",
 "]\n",
 "\n",
 "# Compute the correlation matrix once and reuse it for the mask and the plot.\n",
 "corr = df[cols].corr()\n",
 "\n",
 "plt.figure(figsize=(10, 7))\n",
 "mask = np.triu(np.ones_like(corr))  # hides the upper triangle, diagonal included\n",
 "sns.heatmap(\n",
 "    corr,\n",
 "    annot=True,\n",
 "    fmt=\".2f\",\n",
 "    cmap=\"Blues\",\n",
 "    mask=mask,\n",
 ")\n",
 "plt.grid(visible=False)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Reduce the artists text to its first comma-separated entry, without brackets\n",
 "# or single quotes.\n",
 "# NOTE(review): the first split is on the letter \"x\", which truncates every name\n",
 "# containing an \"x\" - confirm the intended separator before trusting first_artist.\n",
 "df[\"first_artist\"] = (\n",
 "    df[\"artists\"]\n",
 "    .astype(str)\n",
 "    .str.split(\"x\")\n",
 "    .str[0]\n",
 "    .str.replace(\"[\", \"\", regex=False)\n",
 "    .str.replace(\"]\", \"\", regex=False)\n",
 "    .str.replace(\"'\", \"\", regex=False)\n",
 "    .str.split(\",\")\n",
 "    .str[0]\n",
 ")\n",
 "df = df.drop(\"artists\", axis=1)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Popularity statistics per first_artist, best mean first.\n",
 "# A list (not a set) keeps the order of the aggregate columns deterministic.\n",
 "popularity_check = (\n",
 "    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
 "    .agg([\"mean\", \"count\", \"median\"])\n",
 "    .sort_values(by=\"mean\", ascending=False)\n",
 "    .reset_index(drop=True)\n",
 ")\n",
 "\n",
 "# NOTE(review): hard-coded row labels taken from one particular run; they point\n",
 "# at different artists as soon as the data or the sort order changes. Prefer\n",
 "# filtering by artist name. A missing label still raises KeyError, as before.\n",
 "popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)\n",
 "\n",
 "# Keep artists with more than 45 rows, then the 50 with the highest mean popularity.\n",
 "popularity_check = (\n",
 "    popularity_check[popularity_check[\"count\"] > 45]\n",
 "    .sort_values(by=\"mean\", ascending=False)\n",
 "    .head(50)\n",
 ")"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# All rows of the selected artists, grouped by artist in ranking order.\n",
 "# One filter plus a stable sort replaces growing a frame with concat in a loop.\n",
 "artist_rank = {\n",
 "    name: pos for pos, name in enumerate(popularity_check[\"first_artist\"].unique())\n",
 "}\n",
 "top_50_artists = (\n",
 "    df[df[\"first_artist\"].isin(list(artist_rank))]\n",
 "    .assign(_rank=lambda frame: frame[\"first_artist\"].map(artist_rank))\n",
 "    .sort_values(\"_rank\", kind=\"stable\")\n",
 "    .drop(columns=\"_rank\")\n",
 "    .reset_index(drop=True)\n",
 ")\n",
 "top_50_artists.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Number of rows per selected artist, smallest bar first.\n",
 "plt.figure(figsize=(12, 8))\n",
 "(\n",
 "    top_50_artists[\"first_artist\"]\n",
 "    .value_counts()\n",
 "    .sort_values(ascending=True)\n",
 "    .plot(kind=\"barh\")\n",
 ")\n",
 "plt.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"year\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# Per-artist density of every feature on a 3x4 grid; the legend is switched off.\n",
 "plt.figure(figsize=(15, 10))\n",
 "\n",
 "for i, col in enumerate(cols):\n",
 "    ax = plt.subplot(3, 4, i + 1)\n",
 "    # ax is the current axes, so passing it only makes the target explicit.\n",
 "    sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False, ax=ax)\n",
 "\n",
 "plt.tight_layout()\n",
 "plt.show()"
] }, { "cell_type": "markdown", "metadata": {}, "source": 
[ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "df = pd.read_csv(\"data.csv\", parse_dates=[\"release_date\"])\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Integer codes -> display labels for the key and mode columns.\n",
 "map_key = dict(\n",
 "    enumerate([\"C\", \"C#\", \"D\", \"D#\", \"E\", \"F\", \"F#\", \"G\", \"G#\", \"A\", \"A#\", \"B\"])\n",
 ")\n",
 "map_mode = {0: \"Minor\", 1: \"Major\"}\n",
 "\n",
 "df[\"mode\"] = df[\"mode\"].map(map_mode)\n",
 "df[\"key\"] = df[\"key\"].map(map_key)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Index labels of rows repeating an earlier (artists, name) pair.\n",
 "check_dups = df[[\"artists\", \"name\"]]\n",
 "dups = check_dups.loc[check_dups.duplicated()].index"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "print(\"Before dropping duplicated:\", df.shape)\n",
 "df = df.drop(index=dups)\n",
 "print(\"After dropping duplicated:\", df.shape)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Divide by 60_000 (milliseconds per minute) and rename the column accordingly.\n",
 "df = df.assign(duration_ms=df[\"duration_ms\"] / 60_000).rename(\n",
 "    columns={\"duration_ms\": \"duration_min\"}\n",
 ")\n",
 "df.head()"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "df = df.drop(columns=[\"release_date\", \"id\"])\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " 
y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "music_vals = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"popularity\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# One histogram + box plot figure per column.\n",
 "for col in music_vals:\n",
 "    fun_subplots_plotly(df, col)"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Left-closed bins, so a decade starts on its round year: 1930 falls in the\n",
 "# 1930s. Right-closed bins put it in the 1920s and left 1920 without a label.\n",
 "bins = pd.IntervalIndex.from_breaks(list(range(1920, 2031, 10)), closed=\"left\")\n",
 "\n",
 "# Interval text -> decade label, e.g. \"[1920, 1930)\" -> \"1920s\".\n",
 "map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
 "\n",
 "# Years outside the bins become the text \"nan\", which the mapping turns into NaN.\n",
 "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\").map(map_decade)\n",
 "\n",
 "df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "def plot_decade_counts(frame, title):\n",
 "    \"\"\"Draw a horizontal bar chart of the number of rows of ``frame`` per decade.\n",
 "\n",
 "    ``frame`` needs a ``decade`` column; ``title`` is used as the figure title.\n",
 "    \"\"\"\n",
 "    decade_counts = (\n",
 "        frame[\"decade\"]\n",
 "        .value_counts()\n",
 "        .reset_index()\n",
 "        .sort_values(by=\"decade\", ascending=True)\n",
 "    )\n",
 "\n",
 "    fig = px.bar(\n",
 "        data_frame=decade_counts,\n",
 "        y=\"decade\",\n",
 "        x=\"count\",\n",
 "        color=\"count\",\n",
 "        color_continuous_scale=px.colors.sequential.Sunset,\n",
 "        text_auto=True,\n",
 "    )\n",
 "\n",
 "    fig.update_layout(\n",
 "        title={\"text\": title, \"font\": {\"size\": 24}},\n",
 "        coloraxis_colorbar={\n",
 "            \"title\": \"Count\",\n",
 "            \"thicknessmode\": \"pixels\",\n",
 "            \"thickness\": 20,\n",
 "            \"lenmode\": \"pixels\",\n",
 "            \"len\": 500,\n",
 "            \"yanchor\": \"top\",\n",
 "            \"xanchor\": \"right\",\n",
 "            \"y\": 1.15,\n",
 "            \"x\": 0.95,\n",
 "            \"orientation\": \"h\",\n",
 "        },\n",
 "        margin={\"r\": 10},\n",
 "    )\n",
 "\n",
 "    fig.update_xaxes(title=\"Count\")\n",
 "    fig.update_yaxes(title=\"Decades\")\n",
 "    fig.show()\n",
 "\n",
 "\n",
 "plot_decade_counts(df, \"Count of songs per decade\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Same chart after the duration filter; the title states the filter as applied (<=).\n",
 "plot_decade_counts(df, \"Count of songs per decade (<= 5 min)\")"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "cols = [\n",
 "    \"valence\",\n",
 "    \"acousticness\",\n",
 "    \"danceability\",\n",
 "    \"duration_min\",\n",
 "    \"energy\",\n",
 "    \"instrumentalness\",\n",
 "    \"liveness\",\n",
 "    \"loudness\",\n",
 "    \"popularity\",\n",
 "    \"speechiness\",\n",
 "    \"tempo\",\n",
 "]\n",
 "\n",
 "# Stacked histogram of every column, one colour per decade.\n",
 "for col in cols:\n",
 "    fig = px.histogram(\n",
 "        data_frame=df,\n",
 "        x=col,\n",
 "        color=\"decade\",\n",
 "        color_discrete_sequence=px.colors.qualitative.Light24,\n",
 "    )\n",
 "    fig.update_traces(opacity=0.7)\n",
 "    fig.update_layout(\n",
 "        bargap=0.1,\n",
 "        bargroupgap=0.05,\n",
 "        barmode=\"stack\",\n",
 "        title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n",
 "        legend={\"title\": \"Decade\"},\n",
 "    )\n",
 "    fig.update_yaxes(title=\"Count\")\n",
 "    fig.show()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_mean = df.groupby(\"decade\")[cols].mean()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_mean.columns[0],\n", " decade_mean.columns[1],\n", " decade_mean.columns[2],\n", " decade_mean.columns[3],\n", " decade_mean.columns[4],\n", " decade_mean.columns[5],\n", " decade_mean.columns[6],\n", " decade_mean.columns[7],\n", " decade_mean.columns[8],\n", " decade_mean.columns[9],\n", " decade_mean.columns[10],\n", " 
),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 0],\n", " name=decade_mean.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 1],\n", " name=decade_mean.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 2],\n", " name=decade_mean.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 3],\n", " name=decade_mean.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 4],\n", " name=decade_mean.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 5],\n", " name=decade_mean.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 6],\n", " name=decade_mean.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 7],\n", " name=decade_mean.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 8],\n", " name=decade_mean.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 9],\n", " name=decade_mean.columns[9],\n", " 
orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 10],\n", " name=decade_mean.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_mean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_median = df.groupby(\"decade\")[cols].median()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_median.columns[0],\n", " decade_median.columns[1],\n", " decade_median.columns[2],\n", " decade_median.columns[3],\n", " decade_median.columns[4],\n", " decade_median.columns[5],\n", " decade_median.columns[6],\n", " decade_median.columns[7],\n", " decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", 
" name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 7],\n", " name=decade_median.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 8],\n", " name=decade_median.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 9],\n", " name=decade_median.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 10],\n", " name=decade_median.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_median" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, 
"outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, 
explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = 
np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Keep only the first credited artist of each track. `artists` is presumably a\n", "# stringified list like ['A', 'B']: strip the brackets and quotes, then take\n", "# everything before the first comma. (The previous extra split on the letter\n", "# x cut off every name containing an x.)\n", "# The guard makes the cell safe to re-run once `artists` has been dropped.\n", "if \"artists\" in df.columns:\n", "    df = df.assign(\n", "        first_artist=df[\"artists\"]\n", "        .astype(str)\n", "        .str.replace(r\"[\\[\\]']\", \"\", regex=True)\n", "        .str.split(\",\")\n", "        .str[0]\n", "    ).drop(columns=\"artists\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "popularity_check = (\n", "    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n", "    # A list (not a set) keeps the output column order deterministic.\n", "    .agg([\"mean\", \"median\", \"count\"])\n", "    .sort_values(by=\"mean\", ascending=False)\n", "    .reset_index(drop=True)\n", ")\n", "\n", "# NOTE(review): these three row labels are hard-coded positions in the ranking\n", "# above; which artists they refer to is not visible here. `errors=\"ignore\"`\n", "# stops the cell raising KeyError when the ranking is shorter than expected.\n", "popularity_check = popularity_check.drop([16091, 16086, 16429], errors=\"ignore\")\n", "\n", "# Keep artists with more than 45 tracks, then the 50 with the highest mean popularity.\n", "popularity_check = (\n", "    popularity_check[popularity_check[\"count\"] > 45]\n", "    .sort_values(by=\"mean\", ascending=False)\n", "    .head(50)\n", ")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Select every track by the top-ranked artists in one pass instead of growing a\n", "# frame with pd.concat inside a loop. The stable sort keeps rows grouped in\n", "# ranking order, matching the row order the loop produced.\n", "top_artist_rank = {\n", "    name: rank\n", "    for rank, name in enumerate(popularity_check[\"first_artist\"].unique())\n", "}\n", "top_50_artists = (\n", "    df[df[\"first_artist\"].isin(list(top_artist_rank))]\n", "    .sort_values(\n", "        by=\"first_artist\",\n", "        key=lambda names: names.map(top_artist_rank),\n", "        kind=\"stable\",\n", "    )\n", "    .reset_index(drop=True)\n", ")\n", "top_50_artists.head()" ] },
{ "cell_type": "code", "execution_count": null,
"metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bins = pd.IntervalIndex.from_breaks(\n", " [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]\n", ")\n", "\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n", "\n", "map_decade = {\n", " \"(1920, 1930]\": \"1920s\",\n", " \"(1930, 1940]\": \"1930s\",\n", " \"(1940, 1950]\": \"1940s\",\n", " \"(1950, 1960]\": \"1950s\",\n", " \"(1960, 1970]\": \"1960s\",\n", " \"(1970, 1980]\": \"1970s\",\n", " \"(1980, 1990]\": \"1980s\",\n", " \"(1990, 2000]\": \"1990s\",\n", " \"(2000, 2010]\": \"2000s\",\n", " \"(2010, 2020]\": \"2010s\",\n", "}\n", "\n", "df[\"decade\"] = df[\"decade\"].map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": 
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": 
\"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_mean = df.groupby(\"decade\")[cols].mean()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_mean.columns[0],\n", " decade_mean.columns[1],\n", " decade_mean.columns[2],\n", " decade_mean.columns[3],\n", " decade_mean.columns[4],\n", " decade_mean.columns[5],\n", " decade_mean.columns[6],\n", " decade_mean.columns[7],\n", " decade_mean.columns[8],\n", " decade_mean.columns[9],\n", " decade_mean.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 0],\n", " name=decade_mean.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 
1],\n", " name=decade_mean.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 2],\n", " name=decade_mean.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 3],\n", " name=decade_mean.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 4],\n", " name=decade_mean.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 5],\n", " name=decade_mean.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 6],\n", " name=decade_mean.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 7],\n", " name=decade_mean.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 8],\n", " name=decade_mean.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 9],\n", " name=decade_mean.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 10],\n", " name=decade_mean.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Mean Musical Values per 
Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_mean" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_median = df.groupby(\"decade\")[cols].median()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_median.columns[0],\n", " decade_median.columns[1],\n", " decade_median.columns[2],\n", " decade_median.columns[3],\n", " decade_median.columns[4],\n", " decade_median.columns[5],\n", " decade_median.columns[6],\n", " decade_median.columns[7],\n", " decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", " name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " 
go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 7],\n", " name=decade_median.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 8],\n", " name=decade_median.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 9],\n", " name=decade_median.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 10],\n", " name=decade_median.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_median" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", 
")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " 
\"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", " fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"first_artist\"] = (\n", " df[\"artists\"]\n", " .apply(lambda x: 
str(x).split(\"x\")[0])\n", " .apply(lambda x: str(x).replace(\"[\", \"\"))\n", " .apply(lambda x: str(x).replace(\"]\", \"\"))\n", " .apply(lambda x: str(x).replace(\"'\", \"\"))\n", ")\n", "df = df.drop(\"artists\", axis=1)\n", "df[\"first_artist\"] = df[\"first_artist\"].apply(lambda x: str(x).split(\",\")[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "popularity_check = (\n", " df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n", " .agg({\"mean\", \"count\", \"median\"})\n", " .sort_values(by=\"mean\", ascending=False)\n", " .reset_index(drop=True)\n", ")\n", "\n", "popularity_check = popularity_check.drop(16091, axis=0)\n", "popularity_check = popularity_check.drop(16086, axis=0)\n", "popularity_check = popularity_check.drop(16429, axis=0)\n", "\n", "to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n", "popularity_check = popularity_check.drop(to_drop)\n", "popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists = pd.DataFrame()\n", "\n", "for i in popularity_check[\"first_artist\"].unique():\n", " artist = df.query(\"first_artist == @i\")\n", " top_50_artists = pd.concat([top_50_artists, artist], axis=0)\n", "\n", "top_50_artists = top_50_artists.reset_index(drop=True)\n", "top_50_artists.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ 
"top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 
'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", 
"\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " 
fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Left-closed decade bins, so that every decade owns its first year:\n",
"# 1930 is labelled 1930s and 2020 is labelled 2020s.\n",
"bins = pd.IntervalIndex.from_breaks(\n",
"    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],\n",
"    closed=\"left\",\n",
")\n",
"\n",
"df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n",
"\n",
"# Interval text to decade label, built from the bins themselves so the labels\n",
"# cannot drift from the breaks. Years outside the bins map to NaN.\n",
"map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
"\n",
"df[\"decade\"] = df[\"decade\"].map(map_decade)\n",
"\n",
"df.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n",
"fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " 
\"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"decade_mean = df.groupby(\"decade\")[cols].mean()\n",
"\n",
"# One horizontal bar chart per metric, placed left to right, top to bottom,\n",
"# on a grid that is N_GRID_COLS wide and as tall as the metric count requires.\n",
"N_GRID_COLS = 3\n",
"n_grid_rows = -(-len(decade_mean.columns) // N_GRID_COLS)  # ceiling division\n",
"\n",
"fig = make_subplots(\n",
"    rows=n_grid_rows,\n",
"    cols=N_GRID_COLS,\n",
"    subplot_titles=tuple(decade_mean.columns),\n",
"    vertical_spacing=0.1,\n",
")\n",
"\n",
"for position, metric in enumerate(decade_mean.columns):\n",
"    fig.add_trace(\n",
"        go.Bar(\n",
"            y=decade_mean.index,\n",
"            x=decade_mean.iloc[:, position],\n",
"            name=metric,\n",
"            orientation=\"h\",\n",
"        ),\n",
"        # plotly grid coordinates are 1-based\n",
"        row=position // N_GRID_COLS + 1,\n",
"        col=position % N_GRID_COLS + 1,\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
"    legend={\n",
"        \"orientation\": \"v\",\n",
"        \"yanchor\": \"top\",\n",
"        \"y\": 0.2,\n",
"        \"xanchor\": \"right\",\n",
"        \"x\": 0.9,\n",
"    },\n",
"    margin={\"b\": 10, \"r\": 10},\n",
"    height=980,\n",
")\n",
"\n",
"fig.show()\n",
"\n",
"del decade_mean"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"decade_median = df.groupby(\"decade\")[cols].median()\n",
"\n",
"# One horizontal bar chart per metric, placed left to right, top to bottom,\n",
"# on a grid that is N_GRID_COLS wide and as tall as the metric count requires.\n",
"N_GRID_COLS = 3\n",
"n_grid_rows = -(-len(decade_median.columns) // N_GRID_COLS)  # ceiling division\n",
"\n",
"fig = make_subplots(\n",
"    rows=n_grid_rows,\n",
"    cols=N_GRID_COLS,\n",
"    subplot_titles=tuple(decade_median.columns),\n",
"    vertical_spacing=0.1,\n",
")\n",
"\n",
"for position, metric in enumerate(decade_median.columns):\n",
"    fig.add_trace(\n",
"        go.Bar(\n",
"            y=decade_median.index,\n",
"            x=decade_median.iloc[:, position],\n",
"            name=metric,\n",
"            orientation=\"h\",\n",
"        ),\n",
"        # plotly grid coordinates are 1-based\n",
"        row=position // N_GRID_COLS + 1,\n",
"        col=position % N_GRID_COLS + 1,\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
"    legend={\n",
"        \"orientation\": \"v\",\n",
"        \"yanchor\": \"top\",\n",
"        \"y\": 0.2,\n",
"        \"xanchor\": \"right\",\n",
"        \"x\": 0.9,\n",
"    },\n",
"    margin={\"b\": 10, \"r\": 10},\n",
"    height=980,\n",
")\n",
"\n",
"fig.show()\n",
"\n",
"del decade_median"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", " df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", " df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", " df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", " .count()\n", " .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", " data_frame=explicit_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"explicit\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n",
")\n", "fig.update_layout(\n", " title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=mode_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"mode\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"group\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", " data_frame=key_count,\n", " x=\"decade\",\n", " y=\"count\",\n", " color=\"key\",\n", " histfunc=\"sum\",\n", " color_discrete_sequence=px.colors.qualitative.Pastel,\n", " text_auto=True,\n", ")\n", "fig.update_layout(\n", " title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", " bargap=0.2,\n", " bargroupgap=0.1,\n", " barmode=\"stack\",\n", " xaxis_title=\"Decade\",\n", " yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", " fig = px.scatter(\n", " data_frame=df,\n", " x=col,\n", " y=\"popularity\",\n", " trendline=\"ols\",\n", " trendline_color_override=\"#A6E3A1\",\n", " )\n", " fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", " fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", 
" fig.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.regplot(\n", " data=df,\n", " x=\"popularity\",\n", " y=col,\n", " ax=ax,\n", " scatter_kws={\"alpha\": 0.5},\n", " line_kws={\"color\": \"#F38BA8\"},\n", " )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Turn the stringified artist list into the name of its first artist: remove\n",
"# the square brackets and single quotes, then keep the text before the first comma.\n",
"# The value is deliberately NOT split on the letter x: doing so cuts every name\n",
"# at its first x (Jimi Hendrix would become Jimi Hendri).\n",
"# NOTE(review): presumably artists looks like ['A', 'B'] - confirm against data.csv.\n",
"df[\"first_artist\"] = (\n",
"    df[\"artists\"]\n",
"    .astype(str)\n",
"    .str.replace(\"[\", \"\", regex=False)\n",
"    .str.replace(\"]\", \"\", regex=False)\n",
"    .str.replace(\"'\", \"\", regex=False)\n",
"    .str.split(\",\")\n",
"    .str[0]\n",
")\n",
"df = df.drop(\"artists\", axis=1)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],
"source": [
"# An artist needs more than MIN_TRACKS songs to be ranked; keep the TOP_N best.\n",
"MIN_TRACKS = 45\n",
"TOP_N = 50\n",
"\n",
"# The aggregations are given as a list (a set has no defined order), so the\n",
"# resulting columns always come out as mean, count, median.\n",
"popularity_check = (\n",
"    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
"    .agg([\"mean\", \"count\", \"median\"])\n",
"    .sort_values(by=\"mean\", ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# NOTE(review): hard-coded row labels of the mean-sorted table; which artists they\n",
"# are meant to exclude is not visible here - confirm against the data.\n",
"popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)\n",
"\n",
"popularity_check = (\n",
"    popularity_check[popularity_check[\"count\"] > MIN_TRACKS]\n",
"    .sort_values(by=\"mean\", ascending=False)\n",
"    .head(TOP_N)\n",
")"
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Collect the songs of every ranked artist, in ranking order, and concatenate\n",
"# them in a single call. The leading empty slice of df keeps the result\n",
"# well-formed (same columns) when no artist qualifies.\n",
"artist_frames = [df.iloc[:0]]\n",
"for i in popularity_check[\"first_artist\"].unique():\n",
"    artist_frames.append(df.query(\"first_artist == @i\"))\n",
"\n",
"top_50_artists = pd.concat(artist_frames, axis=0).reset_index(drop=True)\n",
"top_50_artists.head()"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", " top_50_artists[\"first_artist\"]\n", " .value_counts()\n", " .sort_values(ascending=True)\n", " .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n",
"plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", " 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: 
\"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", 
" xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bins = pd.IntervalIndex.from_breaks(\n", " [1920, 1930, 1940, 1950, 
1960, 1970, 1980, 1990, 2000, 2010, 2020]\n", ")\n", "\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n", "\n", "map_decade = {\n", " \"(1920, 1930]\": \"1920s\",\n", " \"(1930, 1940]\": \"1930s\",\n", " \"(1940, 1950]\": \"1940s\",\n", " \"(1950, 1960]\": \"1950s\",\n", " \"(1960, 1970]\": \"1960s\",\n", " \"(1970, 1980]\": \"1970s\",\n", " \"(1980, 1990]\": \"1980s\",\n", " \"(1990, 2000]\": \"1990s\",\n", " \"(2000, 2010]\": \"2000s\",\n", " \"(2010, 2020]\": \"2010s\",\n", "}\n", "\n", "df[\"decade\"] = df[\"decade\"].map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " 
df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_mean = df.groupby(\"decade\")[cols].mean()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_mean.columns[0],\n", " 
decade_mean.columns[1],\n", " decade_mean.columns[2],\n", " decade_mean.columns[3],\n", " decade_mean.columns[4],\n", " decade_mean.columns[5],\n", " decade_mean.columns[6],\n", " decade_mean.columns[7],\n", " decade_mean.columns[8],\n", " decade_mean.columns[9],\n", " decade_mean.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 0],\n", " name=decade_mean.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 1],\n", " name=decade_mean.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 2],\n", " name=decade_mean.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 3],\n", " name=decade_mean.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 4],\n", " name=decade_mean.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 5],\n", " name=decade_mean.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 6],\n", " name=decade_mean.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 7],\n", " name=decade_mean.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " 
# Median of every feature in `cols` (defined by the histogram cell above),
# one row per decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature, placed row-major on a 3-wide grid.
# The grid height is derived from the number of features so the cell keeps
# working when `cols` gains or loses entries.
GRID_WIDTH = 3
features = list(decade_median.columns)
grid_height = max(1, -(-len(features) // GRID_WIDTH))  # ceiling division

fig = make_subplots(
    rows=grid_height,
    cols=GRID_WIDTH,
    subplot_titles=features,
    vertical_spacing=0.1,
)

for slot, feature in enumerate(features):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, slot],
            name=feature,
            orientation="h",
        ),
        row=slot // GRID_WIDTH + 1,
        col=slot % GRID_WIDTH + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The aggregate is only needed for this figure.
del decade_median
# Features compared against popularity ("year" takes the place of
# "popularity" itself in this list).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Marker styling applied to every figure's traces.
marker_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

# One scatter plot per feature with an OLS trendline overlaid.
for col in cols:
    fig = (
        px.scatter(
            data_frame=df,
            x=col,
            y="popularity",
            trendline="ols",
            trendline_color_override="#A6E3A1",
        )
        .update_traces(marker=marker_style)
        .update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    )
    fig.show()
\"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n", "plt.figure(figsize=(10, 7))\n", "mask = np.triu(np.ones_like(df[cols].corr()))\n", "sns.heatmap(\n", " df[cols].corr(),\n", " annot=True,\n", " fmt=\".2f\",\n", " cmap=\"Blues\",\n", " mask=mask,\n", ")\n", "plt.grid(visible=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"first_artist\"] = (\n", " df[\"artists\"]\n", " .apply(lambda x: str(x).split(\"x\")[0])\n", " .apply(lambda x: str(x).replace(\"[\", \"\"))\n", " .apply(lambda x: str(x).replace(\"]\", \"\"))\n", " .apply(lambda x: str(x).replace(\"'\", \"\"))\n", ")\n", "df = df.drop(\"artists\", axis=1)\n", "df[\"first_artist\"] = df[\"first_artist\"].apply(lambda x: str(x).split(\",\")[0])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "popularity_check = (\n", " df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n", " .agg({\"mean\", \"count\", \"median\"})\n", " .sort_values(by=\"mean\", ascending=False)\n", " .reset_index(drop=True)\n", ")\n", "\n", "popularity_check = popularity_check.drop(16091, axis=0)\n", "popularity_check = popularity_check.drop(16086, axis=0)\n", "popularity_check = popularity_check.drop(16429, axis=0)\n", "\n", "to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n", "popularity_check = popularity_check.drop(to_drop)\n", "popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists = pd.DataFrame()\n", "\n", "for i in 
# Features whose distribution is drawn for the selected artists.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# One KDE panel per feature on a 3x4 grid, one curve per artist; the legend is
# turned off on every panel.
for slot, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, slot)
    sns.kdeplot(
        data=top_50_artists,
        x=col,
        hue="first_artist",
        legend=False,
        ax=ax,
    )

plt.tight_layout()
plt.show()
# Integer key codes 0-11 mapped to note names, in code order.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)

# Mode flag mapped to its label.
map_mode = {1: "Major", 0: "Minor"}

# Replace the numeric codes with their labels. Codes missing from a mapping
# become NaN, so this cell is not safe to run twice on the same frame.
for column, mapping in (("mode", map_mode), ("key", map_key)):
    df[column] = df[column].map(mapping)
\"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " 
\"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "bins = pd.IntervalIndex.from_breaks(\n", " [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]\n", ")\n", "\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n", "\n", "map_decade = {\n", " \"(1920, 1930]\": \"1920s\",\n", " \"(1930, 1940]\": \"1930s\",\n", " \"(1940, 1950]\": \"1940s\",\n", " \"(1950, 1960]\": \"1950s\",\n", " \"(1960, 1970]\": \"1960s\",\n", " \"(1970, 1980]\": \"1970s\",\n", " \"(1980, 1990]\": \"1980s\",\n", " \"(1990, 2000]\": \"1990s\",\n", " \"(2000, 2010]\": \"2000s\",\n", " \"(2010, 2020]\": \"2010s\",\n", "}\n", "\n", "df[\"decade\"] = df[\"decade\"].map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", 
" color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": 
# Mean of every feature in `cols` (defined by the histogram cell above),
# one row per decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a 3-wide grid, filled row by row.
# Rows are computed from the feature count instead of being hard-coded, so
# changing `cols` does not require editing this cell.
N_GRID_COLS = 3
feature_names = list(decade_mean.columns)
n_grid_rows = max(1, -(-len(feature_names) // N_GRID_COLS))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=N_GRID_COLS,
    subplot_titles=feature_names,
    vertical_spacing=0.1,
)

for position, feature in enumerate(feature_names):
    grid_row, grid_col = divmod(position, N_GRID_COLS)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The aggregate is only needed for this figure.
del decade_mean
"code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_median = df.groupby(\"decade\")[cols].median()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_median.columns[0],\n", " decade_median.columns[1],\n", " decade_median.columns[2],\n", " decade_median.columns[3],\n", " decade_median.columns[4],\n", " decade_median.columns[5],\n", " decade_median.columns[6],\n", " decade_median.columns[7],\n", " decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", " name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " 
def plot_category_count_per_decade(data, category, title, barmode):
    """Build a bar chart of track counts per decade, split by ``category``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with "decade", "year" and ``category`` columns.
    category : str
        Column used to split (and colour) the bars within each decade.
    title : str
        Figure title.
    barmode : str
        Plotly bar mode, e.g. "group" or "stack".

    Returns
    -------
    plotly Figure
        The configured figure; the caller decides when to show it.
    """
    # Non-null "year" entries counted per (decade, category) pair.
    counts = (
        data.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )
    figure = px.histogram(
        data_frame=counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    figure.update_layout(
        title={"text": title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=barmode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    return figure


# Same chart for three categorical columns; "key" has many levels and is
# stacked, the other two are drawn as grouped bars.
for category, title, barmode in (
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
):
    fig = plot_category_count_per_decade(df, category, title, barmode)
    fig.show()
# Features included in the correlation matrix (popularity is the last entry).
cols = [
    "valence",
    "acousticness",
    "danceability",
    "duration_min",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "year",
    "speechiness",
    "tempo",
    "popularity",
]

# Compute the correlation matrix once and reuse it for both the mask and the
# heatmap.
corr = df[cols].corr()

# True on and above the diagonal: those cells are hidden, leaving only the
# lower triangle of the symmetric matrix.
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 7))
sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    mask=mask,
)
plt.grid(visible=False)
# Artists kept in `popularity_check`, in the order they appear there.
top_artist_names = popularity_check["first_artist"].unique()

# Gather each artist's tracks and concatenate once. Growing a DataFrame with
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
# Rows stay grouped by artist in `top_artist_names` order.
artist_frames = [df[df["first_artist"] == name] for name in top_artist_names]

top_50_artists = (
    pd.concat(artist_frames, axis=0, ignore_index=True)
    if artist_frames
    # No artists selected: keep df's columns so downstream cells can still
    # reference them.
    else df.iloc[:0].reset_index(drop=True)
)
top_50_artists.head()
" \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", " ax = plt.subplot(3, 4, i + 1)\n", " sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib as mpl\n", "import mplcatppuccin\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "from plotly.subplots import make_subplots\n", "import plotly.figure_factory as ff\n", "import warnings\n", "import plotly.io as pio\n", "\n", "warnings.filterwarnings('ignore')\n", "\n", "pio.renderers.default = 'notebook_connected'\n", "pio.templates.default = 'plotly_dark'\n", "\n", "\n", "%matplotlib inline\n", "%config InlineBackend.figure_format = 'retina'\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.precision', 2)\n", "pd.set_option('display.colheader_justify', 'left')\n", "\n", "mpl.rc(\n", " 'figure',\n", " autolayout=True,\n", " titlesize=18\n", ")\n", "\n", "mpl.style.use(['ggplot', 'mocha'])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Header 3.2\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\n", " \"data.csv\",\n", " parse_dates=[\"release_date\"],\n", ")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "map_key = {\n", " 0: \"C\",\n", " 1: \"C#\",\n", " 2: \"D\",\n", " 3: \"D#\",\n", 
" 4: \"E\",\n", " 5: \"F\",\n", " 6: \"F#\",\n", " 7: \"G\",\n", " 8: \"G#\",\n", " 9: \"A\",\n", " 10: \"A#\",\n", " 11: \"B\",\n", "}\n", "\n", "map_mode = {1: \"Major\", 0: \"Minor\"}\n", "\n", "df[\"mode\"] = df[\"mode\"].map(map_mode)\n", "df[\"key\"] = df[\"key\"].map(map_key)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "check_dups = df[[\"artists\", \"name\"]]\n", "dups = check_dups[check_dups.duplicated()].index" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(\"Before dropping duplicated:\", df.shape)\n", "df = df.drop(dups)\n", "print(\"After dropping duplicated:\", df.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.3\n", "\n", "## Header 2.3\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df[\"duration_ms\"] = df[\"duration_ms\"] / 60_000\n", "df = df.rename(columns={\"duration_ms\": \"duration_min\"})\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.4\n", "\n", "## Header 2.4\n", "\n", "### Header 3.5" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.drop([\"release_date\", \"id\"], axis=1)\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def fun_subplots_plotly(df, col):\n", "\n", " mean_val = df[col].mean()\n", " median_val = df[col].median()\n", " fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])\n", "\n", " fig.add_trace(\n", " go.Histogram(\n", " x=df[col], name=f\"Histogram {col}\", marker={\"color\": \"#EBA0AC\"}, nbinsx=50\n", " ),\n", " row=1,\n", " col=1,\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=mean_val,\n", " y1=mean_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": 
\"#A6E3A1\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_shape(\n", " type=\"line\",\n", " xref=\"paper\",\n", " yref=\"y\",\n", " x0=-1,\n", " x1=1,\n", " y0=median_val,\n", " y1=median_val,\n", " row=1,\n", " col=2,\n", " line={\"color\": \"#CBA6F7\", \"width\": 3, \"dash\": \"dot\"},\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=-0.7,\n", " y=mean_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Mean = {mean_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_annotation(\n", " xref=\"paper\",\n", " x=0.7,\n", " y=median_val,\n", " showarrow=True,\n", " arrowhead=2,\n", " text=f\"Median = {median_val:.2f}\",\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.add_trace(\n", " go.Box(y=df[col], name=f\"Boxplot {col}\", marker={\"color\": \"#F9E2AF\"}),\n", " row=1,\n", " col=2,\n", " )\n", "\n", " fig.update_layout(\n", " title={\"text\": f\"Distribution Plot {col}\", \"font\": {\"size\": 24}},\n", " legend={\n", " \"orientation\": \"h\",\n", " \"yanchor\": \"bottom\",\n", " \"y\": 1.02,\n", " \"xanchor\": \"right\",\n", " \"x\": 1,\n", " },\n", " )\n", " fig.update_yaxes(title_text=\"Count\", row=1, col=1)\n", " fig.update_yaxes(title_text=f\"{col}\", row=1, col=2)\n", " fig.update_xaxes(title_text=f\"{col}\", row=1, col=1)\n", " fig.update_xaxes(title_text=\"\", row=1, col=2)\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "music_vals = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "\n", "for col in music_vals:\n", " fun_subplots_plotly(df, col)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", "execution_count": null, 
"metadata": {}, "outputs": [], "source": [ "bins = pd.IntervalIndex.from_breaks(\n", " [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020]\n", ")\n", "\n", "df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n", "\n", "map_decade = {\n", " \"(1920, 1930]\": \"1920s\",\n", " \"(1930, 1940]\": \"1930s\",\n", " \"(1940, 1950]\": \"1940s\",\n", " \"(1950, 1960]\": \"1950s\",\n", " \"(1960, 1970]\": \"1960s\",\n", " \"(1970, 1980]\": \"1970s\",\n", " \"(1980, 1990]\": \"1980s\",\n", " \"(1990, 2000]\": \"1990s\",\n", " \"(2000, 2010]\": \"2000s\",\n", " \"(2010, 2020]\": \"2010s\",\n", "}\n", "\n", "df[\"decade\"] = df[\"decade\"].map(map_decade)\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] }, { "cell_type": "code", 
"execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_counts = (\n", " df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", " data_frame=decade_counts,\n", " y=\"decade\",\n", " x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "decade_mean = df.groupby(\"decade\")[cols].mean()\n", "\n", "fig = 
make_subplots(\n",
"    rows=4,\n",
"    cols=3,\n",
"    subplot_titles=tuple(decade_mean.columns),\n",
"    vertical_spacing=0.1,\n",
")\n",
"\n",
"# One horizontal bar chart per metric, filling the 4x3 grid row by row.\n",
"for position, metric in enumerate(decade_mean.columns):\n",
"    fig.add_trace(\n",
"        go.Bar(\n",
"            y=decade_mean.index,\n",
"            x=decade_mean.iloc[:, position],\n",
"            name=metric,\n",
"            orientation=\"h\",\n",
"        ),\n",
"        row=position // 3 + 1,\n",
"        col=position % 3 + 1,\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
"    legend={\n",
"        \"orientation\": \"v\",\n",
"        \"yanchor\": \"top\",\n",
"        \"y\": 0.2,\n",
"        \"xanchor\": \"right\",\n",
"        \"x\": 0.9,\n",
"    },\n",
"    margin={\"b\": 10, \"r\": 10},\n",
"    height=980,\n",
")\n",
"\n",
"fig.show()\n",
"\n",
"del decade_mean"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"decade_median = df.groupby(\"decade\")[cols].median()\n",
"\n",
"fig = make_subplots(\n",
"    rows=4,\n",
"    cols=3,\n",
"    subplot_titles=tuple(decade_median.columns),\n",
"    vertical_spacing=0.1,\n",
")\n",
"\n",
"# One horizontal bar chart per metric, filling the 4x3 grid row by row.\n",
"for position, metric in enumerate(decade_median.columns):\n",
"    fig.add_trace(\n",
"        go.Bar(\n",
"            y=decade_median.index,\n",
"            x=decade_median.iloc[:, position],\n",
"            name=metric,\n",
"            orientation=\"h\",\n",
"        ),\n",
"        row=position // 3 + 1,\n",
"        col=position % 3 + 1,\n",
"    )\n",
"\n",
"fig.update_layout(\n",
"    title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n",
"    legend={\n",
"        \"orientation\": \"v\",\n",
"        \"yanchor\": \"top\",\n",
"        \"y\": 0.2,\n",
"        \"xanchor\": \"right\",\n",
"        \"x\": 0.9,\n",
"    },\n",
"    margin={\"b\": 10, \"r\": 10},\n",
"    height=980,\n",
")\n",
"\n",
"fig.show()\n",
"\n",
"del decade_median"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "explicit_count = (\n", "    df.groupby([\"decade\", \"explicit\"], as_index=False)[\"year\"]\n", "    .count()\n", "    .rename(columns={\"year\": \"count\"})\n", ")\n", "key_count = (\n", "    df.groupby([\"decade\", \"key\"], as_index=False)[\"year\"]\n", "    .count()\n", "    .rename(columns={\"year\": \"count\"})\n", ")\n", "mode_count = (\n", "    df.groupby([\"decade\", \"mode\"], as_index=False)[\"year\"]\n", "    .count()\n", "    .rename(columns={\"year\": \"count\"})\n", ")\n", "\n", "fig = px.histogram(\n", "    data_frame=explicit_count,\n", "    x=\"decade\",\n", "    y=\"count\",\n", "    color=\"explicit\",\n", "    histfunc=\"sum\",\n", "    color_discrete_sequence=px.colors.qualitative.Pastel,\n", "    text_auto=True,\n", ")\n", "fig.update_layout(\n", "    title={\"text\": \"Explicit Count per Decade\", \"font\": {\"size\": 24}},\n", "    bargap=0.2,\n", "    bargroupgap=0.1,\n", "    barmode=\"group\",\n", "    xaxis_title=\"Decade\",\n", "    yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", "    data_frame=mode_count,\n", "    x=\"decade\",\n", "    y=\"count\",\n", "    color=\"mode\",\n", "    histfunc=\"sum\",\n", "    color_discrete_sequence=px.colors.qualitative.Pastel,\n", "    text_auto=True,\n", ")\n", "fig.update_layout(\n", "    title={\"text\": \"Mode Count per Decade\", \"font\": {\"size\": 24}},\n", "    bargap=0.2,\n", "    bargroupgap=0.1,\n", "    barmode=\"group\",\n", "    xaxis_title=\"Decade\",\n", "    yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "fig = px.histogram(\n", "    data_frame=key_count,\n", "    x=\"decade\",\n", "    y=\"count\",\n", " 
color=\"key\",\n", "    histfunc=\"sum\",\n", "    color_discrete_sequence=px.colors.qualitative.Pastel,\n", "    text_auto=True,\n", ")\n", "fig.update_layout(\n", "    title={\"text\": \"Key Count per Decade\", \"font\": {\"size\": 24}},\n", "    bargap=0.2,\n", "    bargroupgap=0.1,\n", "    barmode=\"stack\",\n", "    xaxis_title=\"Decade\",\n", "    yaxis_title=\"Count\",\n", ")\n", "fig.show()\n", "\n", "del (mode_count, key_count, explicit_count)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", "    \"valence\",\n", "    \"acousticness\",\n", "    \"danceability\",\n", "    \"duration_min\",\n", "    \"energy\",\n", "    \"instrumentalness\",\n", "    \"liveness\",\n", "    \"loudness\",\n", "    \"year\",\n", "    \"speechiness\",\n", "    \"tempo\",\n", "]\n", "\n", "for col in cols:\n", "\n", "    fig = px.scatter(\n", "        data_frame=df,\n", "        x=col,\n", "        y=\"popularity\",\n", "        trendline=\"ols\",\n", "        trendline_color_override=\"#A6E3A1\",\n", "    )\n", "    fig.update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n", "    fig.update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n", "    fig.show()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", "    \"valence\",\n", "    \"acousticness\",\n", "    \"danceability\",\n", "    \"duration_min\",\n", "    \"energy\",\n", "    \"instrumentalness\",\n", "    \"liveness\",\n", "    \"loudness\",\n", "    \"year\",\n", "    \"speechiness\",\n", "    \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", "    ax = plt.subplot(3, 4, i + 1)\n", "    sns.regplot(\n", "        data=df,\n", "        x=\"popularity\",\n", "        y=col,\n", "        ax=ax,\n", "        scatter_kws={\"alpha\": 0.5},\n", "        line_kws={\"color\": \"#F38BA8\"},\n", "    )\n", "\n", "plt.tight_layout()\n", "plt.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"cols = [\n",
"    \"valence\",\n",
"    \"acousticness\",\n",
"    \"danceability\",\n",
"    \"duration_min\",\n",
"    \"energy\",\n",
"    \"instrumentalness\",\n",
"    \"liveness\",\n",
"    \"loudness\",\n",
"    \"year\",\n",
"    \"speechiness\",\n",
"    \"tempo\",\n",
"    \"popularity\",\n",
"]\n",
"\n",
"# Compute the correlation matrix once and reuse it for the mask and the plot.\n",
"corr = df[cols].corr()\n",
"\n",
"plt.figure(figsize=(10, 7))\n",
"# Mask the upper triangle (diagonal included) so each pair is shown once.\n",
"mask = np.triu(np.ones_like(corr))\n",
"sns.heatmap(\n",
"    corr,\n",
"    annot=True,\n",
"    fmt=\".2f\",\n",
"    cmap=\"Blues\",\n",
"    mask=mask,\n",
")\n",
"plt.grid(visible=False)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Strip brackets and single quotes from the artists text, then keep what\n",
"# precedes the first comma.\n",
"df[\"first_artist\"] = (\n",
"    df[\"artists\"]\n",
"    .astype(str)\n",
"    .str.replace(\"[\", \"\", regex=False)\n",
"    .str.replace(\"]\", \"\", regex=False)\n",
"    .str.replace(\"'\", \"\", regex=False)\n",
"    .str.split(\",\")\n",
"    .str[0]\n",
")\n",
"df = df.drop(\"artists\", axis=1)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
"# Per-artist popularity statistics, most popular (by mean) first.\n",
"# A list keeps the aggregated column order deterministic; a set does not.\n",
"popularity_check = (\n",
"    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
"    .agg([\"mean\", \"count\", \"median\"])\n",
"    .sort_values(by=\"mean\", ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# NOTE(review): these labels are row positions in the sorted frame above, so they\n",
"# depend on the exact input data and on how first_artist is derived - confirm\n",
"# which artists they are meant to remove.\n",
"popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)\n",
"\n",
"# Keep artists with more than 45 rows, then the 50 with the highest mean popularity.\n",
"to_drop = popularity_check[popularity_check[\"count\"] <= 45].index\n",
"popularity_check = popularity_check.drop(to_drop)\n",
"popularity_check = popularity_check.sort_values(by=\"mean\", ascending=False).head(50)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": 
{}, "outputs": [], "source": [
"top_artist_names = popularity_check[\"first_artist\"].unique()\n",
"artist_rank = {name: rank for rank, name in enumerate(top_artist_names)}\n",
"\n",
"# Select all rows of the top artists in one pass (no pd.concat inside a loop),\n",
"# then order them artist by artist following the ranking in popularity_check.\n",
"top_50_artists = (\n",
"    df[df[\"first_artist\"].isin(top_artist_names)]\n",
"    .sort_values(\"first_artist\", key=lambda names: names.map(artist_rank), kind=\"stable\")\n",
"    .reset_index(drop=True)\n",
")\n",
"top_50_artists.head()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.figure(figsize=(12, 8))\n", "(\n", "    top_50_artists[\"first_artist\"]\n", "    .value_counts()\n", "    .sort_values(ascending=True)\n", "    .plot(kind=\"barh\")\n", ")\n", "plt.show()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "cols = [\n", "    \"valence\",\n", "    \"acousticness\",\n", "    \"danceability\",\n", "    \"duration_min\",\n", "    \"energy\",\n", "    \"instrumentalness\",\n", "    \"liveness\",\n", "    \"loudness\",\n", "    \"year\",\n", "    \"speechiness\",\n", "    \"tempo\",\n", "]\n", "\n", "plt.figure(figsize=(15, 10))\n", "\n", "for i, col in enumerate(cols):\n", "    ax = plt.subplot(3, 4, i + 1)\n", "    sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False)\n", "\n", "plt.tight_layout()\n", "plt.show()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1\n", "\n", "## Header 2\n", "\n", "### Header 3.1" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [
"# Half-open [start, start + 10) bins: 1930 belongs to the 1930s. Right-closed\n",
"# bins would file 1930 under the 1920s and leave 1920 itself unbinned.\n",
"bins = pd.IntervalIndex.from_breaks(\n",
"    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],\n",
"    closed=\"left\",\n",
")\n",
"\n",
"df[\"decade\"] = pd.cut(df[\"year\"], bins).astype(\"str\")\n",
"\n",
"# Interval text -> label, e.g. \"[1920, 1930)\" -> \"1920s\"; built from the bins so\n",
"# the keys always match the strings produced above.\n",
"map_decade = {str(interval): f\"{interval.left}s\" for interval in bins}\n",
"\n",
"df[\"decade\"] = df[\"decade\"].map(map_decade)\n",
"\n",
"df.head()"
] },
{ "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "decade_counts = (\n", "    df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", "    data_frame=decade_counts,\n", "    y=\"decade\",\n", "    x=\"count\",\n", "    color=\"count\",\n", "    color_continuous_scale=px.colors.sequential.Sunset,\n", "    text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", "    title={\"text\": \"Count of songs per decade\", \"font\": {\"size\": 24}},\n", "    coloraxis_colorbar={\n", "        \"title\": \"Count\",\n", "        \"thicknessmode\": \"pixels\",\n", "        \"thickness\": 20,\n", "        \"lenmode\": \"pixels\",\n", "        \"len\": 500,\n", "        \"yanchor\": \"top\",\n", "        \"xanchor\": \"right\",\n", "        \"y\": 1.15,\n", "        \"x\": 0.95,\n", "        \"orientation\": \"h\",\n", "    },\n", "    margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "df = df.query(\"duration_min <= 5\")" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Header 1.2\n", "\n", "## Header 2.2\n", "\n", "### Header 3.4" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "decade_counts = (\n", "    df[\"decade\"].value_counts().reset_index().sort_values(by=\"decade\", ascending=True)\n", ")\n", "\n", "fig = px.bar(\n", "    data_frame=decade_counts,\n", "    y=\"decade\",\n", " 
x=\"count\",\n", " color=\"count\",\n", " color_continuous_scale=px.colors.sequential.Sunset,\n", " text_auto=True,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Count of songs per decade (< 5min)\", \"font\": {\"size\": 24}},\n", " coloraxis_colorbar={\n", " \"title\": \"Count\",\n", " \"thicknessmode\": \"pixels\",\n", " \"thickness\": 20,\n", " \"lenmode\": \"pixels\",\n", " \"len\": 500,\n", " \"yanchor\": \"top\",\n", " \"xanchor\": \"right\",\n", " \"y\": 1.15,\n", " \"x\": 0.95,\n", " \"orientation\": \"h\",\n", " },\n", " margin={\"r\": 10},\n", ")\n", "\n", "fig.update_xaxes(title=\"Count\")\n", "fig.update_yaxes(title=\"Decades\")\n", "fig.show()\n", "\n", "del decade_counts" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " \"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"popularity\",\n", " \"speechiness\",\n", " \"tempo\",\n", "]\n", "for col in cols:\n", " fig = px.histogram(\n", " data_frame=df,\n", " x=col,\n", " color=\"decade\",\n", " color_discrete_sequence=px.colors.qualitative.Light24,\n", " )\n", " fig.update_traces(opacity=0.7)\n", " fig.update_layout(\n", " bargap=0.1,\n", " bargroupgap=0.05,\n", " barmode=\"stack\",\n", " title={\"text\": f\"Histogram of {col} per decade\", \"font\": {\"size\": 24}},\n", " legend={\"title\": \"Decade\"},\n", " )\n", " fig.update_yaxes(title=\"Count\")\n", " fig.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "decade_mean = df.groupby(\"decade\")[cols].mean()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_mean.columns[0],\n", " decade_mean.columns[1],\n", " decade_mean.columns[2],\n", " decade_mean.columns[3],\n", " decade_mean.columns[4],\n", " 
decade_mean.columns[5],\n", " decade_mean.columns[6],\n", " decade_mean.columns[7],\n", " decade_mean.columns[8],\n", " decade_mean.columns[9],\n", " decade_mean.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 0],\n", " name=decade_mean.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 1],\n", " name=decade_mean.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 2],\n", " name=decade_mean.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 3],\n", " name=decade_mean.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 4],\n", " name=decade_mean.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 5],\n", " name=decade_mean.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 6],\n", " name=decade_mean.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 7],\n", " name=decade_mean.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 8],\n", " name=decade_mean.columns[8],\n", " orientation=\"h\",\n", " ),\n", 
" row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 9],\n", " name=decade_mean.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_mean.index,\n", " x=decade_mean.iloc[:, 10],\n", " name=decade_mean.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Mean Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", " \"x\": 0.9,\n", " },\n", " margin={\"b\": 10, \"r\": 10},\n", " height=980,\n", ")\n", "\n", "fig.show()\n", "\n", "del decade_mean" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "metadata": {} }, "outputs": [], "source": [ "decade_median = df.groupby(\"decade\")[cols].median()\n", "\n", "fig = make_subplots(\n", " rows=4,\n", " cols=3,\n", " subplot_titles=(\n", " decade_median.columns[0],\n", " decade_median.columns[1],\n", " decade_median.columns[2],\n", " decade_median.columns[3],\n", " decade_median.columns[4],\n", " decade_median.columns[5],\n", " decade_median.columns[6],\n", " decade_median.columns[7],\n", " decade_median.columns[8],\n", " decade_median.columns[9],\n", " decade_median.columns[10],\n", " ),\n", " vertical_spacing=0.1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 0],\n", " name=decade_median.columns[0],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 1],\n", " name=decade_median.columns[1],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 2],\n", " 
name=decade_median.columns[2],\n", " orientation=\"h\",\n", " ),\n", " row=1,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 3],\n", " name=decade_median.columns[3],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 4],\n", " name=decade_median.columns[4],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 5],\n", " name=decade_median.columns[5],\n", " orientation=\"h\",\n", " ),\n", " row=2,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 6],\n", " name=decade_median.columns[6],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 7],\n", " name=decade_median.columns[7],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=2,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 8],\n", " name=decade_median.columns[8],\n", " orientation=\"h\",\n", " ),\n", " row=3,\n", " col=3,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 9],\n", " name=decade_median.columns[9],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=1,\n", ")\n", "\n", "fig.add_trace(\n", " go.Bar(\n", " y=decade_median.index,\n", " x=decade_median.iloc[:, 10],\n", " name=decade_median.columns[10],\n", " orientation=\"h\",\n", " ),\n", " row=4,\n", " col=2,\n", ")\n", "\n", "fig.update_layout(\n", " title={\"text\": \"Median Musical Values per Decade\", \"font\": {\"size\": 22}},\n", " legend={\n", " \"orientation\": \"v\",\n", " \"yanchor\": \"top\",\n", " \"y\": 0.2,\n", " \"xanchor\": \"right\",\n", 
" \"x\": 0.9,\n",
    " },\n",
    " margin={\"b\": 10, \"r\": 10},\n",
    " height=980,\n",
    ")\n",
    "\n",
    "fig.show()\n",
    "\n",
    "del decade_median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Row counts per decade for each categorical feature. The year column is only\n",
    "# the column being counted; it is renamed to count afterwards.\n",
    "explicit_count, key_count, mode_count = (\n",
    "    df.groupby([\"decade\", feature], as_index=False)[\"year\"]\n",
    "    .count()\n",
    "    .rename(columns={\"year\": \"count\"})\n",
    "    for feature in (\"explicit\", \"key\", \"mode\")\n",
    ")\n",
    "\n",
    "# One bar chart per feature, in display order:\n",
    "# (counts frame, colour column, title, bar mode).\n",
    "chart_specs = [\n",
    "    (explicit_count, \"explicit\", \"Explicit Count per Decade\", \"group\"),\n",
    "    (mode_count, \"mode\", \"Mode Count per Decade\", \"group\"),\n",
    "    (key_count, \"key\", \"Key Count per Decade\", \"stack\"),\n",
    "]\n",
    "\n",
    "for counts, feature, title, barmode in chart_specs:\n",
    "    fig = px.histogram(\n",
    "        data_frame=counts,\n",
    "        x=\"decade\",\n",
    "        y=\"count\",\n",
    "        color=feature,\n",
    "        histfunc=\"sum\",\n",
    "        color_discrete_sequence=px.colors.qualitative.Pastel,\n",
    "        text_auto=True,\n",
    "    )\n",
    "    fig.update_layout(\n",
    "        title={\"text\": title, \"font\": {\"size\": 24}},\n",
    "        bargap=0.2,\n",
    "        bargroupgap=0.1,\n",
    "        barmode=barmode,\n",
    "        xaxis_title=\"Decade\",\n",
    "        yaxis_title=\"Count\",\n",
    "    )\n",
    "    fig.show()\n",
    "\n",
    "# Release every reference to the temporary count frames, the loop's included.\n",
    "del mode_count, key_count, explicit_count, chart_specs, counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "cols = [\n",
    "    \"valence\",\n",
    "    \"acousticness\",\n",
    "    \"danceability\",\n",
    "    \"duration_min\",\n",
    "    \"energy\",\n",
    "    \"instrumentalness\",\n",
    "    \"liveness\",\n",
    "    \"loudness\",\n",
    "    \"year\",\n",
    "    \"speechiness\",\n",
    "    \"tempo\",\n",
    "]\n",
    "\n",
    "# One scatter of each feature against popularity, with an OLS trend line.\n",
    "# update_traces / update_layout return the figure, so the calls are chained.\n",
    "for col in cols:\n",
    "    fig = (\n",
    "        px.scatter(\n",
    "            data_frame=df,\n",
    "            x=col,\n",
    "            y=\"popularity\",\n",
    "            trendline=\"ols\",\n",
    "            trendline_color_override=\"#A6E3A1\",\n",
    "        )\n",
    "        .update_traces(marker={\"opacity\": 0.3, \"size\": 3, \"color\": \"#B4BEFE\"})\n",
    "        .update_layout(title={\"text\": f\"{col} vs popularity\", \"font\": {\"size\": 24}})\n",
    "    )\n",
    "    fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Header 1\n",
    "\n",
    "## Header 2\n",
    "\n",
    "### Header 3.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "cols = [\n",
    "    \"valence\",\n",
    "    \"acousticness\",\n",
    "    \"danceability\",\n",
    "    \"duration_min\",\n",
    "    \"energy\",\n",
    "    \"instrumentalness\",\n",
    "    \"liveness\",\n",
    "    \"loudness\",\n",
    "    \"year\",\n",
    "    \"speechiness\",\n",
    "    \"tempo\",\n",
    "]\n",
    "\n",
    "plt.figure(figsize=(15, 10))\n",
    "\n",
    "# 3x4 grid with one regression panel per feature; with eleven features the\n",
    "# twelfth slot is never created.\n",
    "for slot, col in enumerate(cols, start=1):\n",
    "    ax = plt.subplot(3, 4, slot)\n",
    "    sns.regplot(\n",
    "        data=df,\n",
    "        x=\"popularity\",\n",
    "        y=col,\n",
    "        ax=ax,\n",
    "        scatter_kws={\"alpha\": 0.5},\n",
    "        line_kws={\"color\": \"#F38BA8\"},\n",
    "    )\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "cols = [\n", " \"valence\",\n", " \"acousticness\",\n", " \"danceability\",\n", " 
\"duration_min\",\n", " \"energy\",\n", " \"instrumentalness\",\n", " \"liveness\",\n", " \"loudness\",\n", " \"year\",\n", " \"speechiness\",\n", " \"tempo\",\n", " \"popularity\",\n", "]\n", "\n",
    "plt.figure(figsize=(10, 7))\n",
    "# Compute the correlation matrix once; it feeds both the mask and the heatmap.\n",
    "corr = df[cols].corr()\n",
    "# Hide the upper triangle (diagonal included) so each pair appears only once.\n",
    "mask = np.triu(np.ones_like(corr))\n",
    "sns.heatmap(\n",
    "    corr,\n",
    "    annot=True,\n",
    "    fmt=\".2f\",\n",
    "    cmap=\"Blues\",\n",
    "    mask=mask,\n",
    ")\n",
    "plt.grid(visible=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Reduce each artists value to its first entry: strip the list punctuation\n",
    "# (square brackets and single quotes), then keep the text before the first comma.\n",
    "# Presumably the column holds stringified Python lists -- TODO confirm.\n",
    "# The earlier split on the letter x was removed because it cut every artist\n",
    "# name short at its first x.\n",
    "df[\"first_artist\"] = (\n",
    "    df[\"artists\"]\n",
    "    .astype(str)\n",
    "    .str.replace(\"[\", \"\", regex=False)\n",
    "    .str.replace(\"]\", \"\", regex=False)\n",
    "    .str.replace(\"'\", \"\", regex=False)\n",
    "    .str.split(\",\")\n",
    "    .str[0]\n",
    ")\n",
    "# Note: this cell is not re-runnable on its own, since it removes its input column.\n",
    "df = df.drop(\"artists\", axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Per-artist popularity statistics, highest mean first. A list (not a set)\n",
    "# keeps the order of the resulting columns deterministic.\n",
    "popularity_check = (\n",
    "    df.groupby([\"first_artist\"], as_index=False)[\"popularity\"]\n",
    "    .agg([\"mean\", \"count\", \"median\"])\n",
    "    .sort_values(by=\"mean\", ascending=False)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "\n",
    "# NOTE(review): these labels are row positions in the mean-sorted table, so they\n",
    "# depend on the exact dataset, on tie ordering and on how first_artist is parsed.\n",
    "# Re-check which artists they refer to before relying on them.\n",
    "popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)\n",
    "\n",
    "# Keep artists with more than 45 entries, then the 50 with the highest mean.\n",
    "popularity_check = (\n",
    "    popularity_check[popularity_check[\"count\"] > 45]\n",
    "    .sort_values(by=\"mean\", ascending=False)\n",
    "    .head(50)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Header 1\n",
    "\n",
    "## Header 2\n",
    "\n",
    "### Header 3.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Gather every row of the selected artists, artist by artist in ranking order.\n",
    "# A single concat over a list avoids re-copying a growing frame on each step\n",
    "# and does not start from an empty DataFrame.\n",
    "top_50_artists = pd.concat(\n",
    "    [\n",
    "        df[df[\"first_artist\"] == artist]\n",
    "        for artist in popularity_check[\"first_artist\"].unique()\n",
    "    ],\n",
    "    axis=0,\n",
    ").reset_index(drop=True)\n",
    "top_50_artists.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "# Number of rows per selected artist, smallest first so the largest bar is on top.\n",
    "plt.figure(figsize=(12, 8))\n",
    "(\n",
    "    top_50_artists[\"first_artist\"]\n",
    "    .value_counts()\n",
    "    .sort_values(ascending=True)\n",
    "    .plot(kind=\"barh\")\n",
    ")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "top_50_artists[\"decade\"].value_counts().sort_values(ascending=True).plot(kind=\"barh\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": [
    "cols = [\n",
    "    \"valence\",\n",
    "    \"acousticness\",\n",
    "    \"danceability\",\n",
    "    \"duration_min\",\n",
    "    \"energy\",\n",
    "    \"instrumentalness\",\n",
    "    \"liveness\",\n",
    "    \"loudness\",\n",
    "    \"year\",\n",
    "    \"speechiness\",\n",
    "    \"tempo\",\n",
    "]\n",
    "\n",
    "plt.figure(figsize=(15, 10))\n",
    "\n",
    "# One density panel per feature, with one curve per selected artist.\n",
    "for i, col in enumerate(cols):\n",
    "    ax = plt.subplot(3, 4, i + 1)\n",
    "    # Target the subplot explicitly instead of relying on the current axes.\n",
    "    sns.kdeplot(data=top_50_artists, x=col, hue=\"first_artist\", legend=False, ax=ax)\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Header 1\n",
    "\n",
    "## Header 2\n",
    "\n",
    "### Header 3.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "metadata": {}
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}