# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# IPython magics: inline matplotlib figures, rendered in 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: no column limit, precision 2, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure title size 18.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# Apply the 'ggplot' style, then 'mocha' on top of it.
# NOTE(review): 'mocha' is presumably registered by the `mplcatppuccin`
# import at the top of this cell — confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table from data.csv; "release_date" is parsed as a date on read.
df = pd.read_csv(filepath_or_buffer="data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Note names in chromatic order; a name's list position is the integer that
# map_key translates it from (0 -> "C", ..., 11 -> "B").
pitch_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
map_key = dict(enumerate(pitch_names))

map_mode = {0: "Minor", 1: "Major"}

# Replace the integer codes with readable labels.  Values missing from a
# mapping become NaN, so running this cell twice blanks both columns.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [1]:
# Leftover scratch cell: prints the literal 1 and does not touch any analysis state.
print(1)

1


In [None]:
# Flag rows whose (artists, name) pair repeats an earlier row; `dups` holds the
# index labels of those later repeats (the first occurrence is not flagged).
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat.to_numpy()]

In [None]:
# Remove the rows listed in `dups`, reporting the table shape before and after.
shape_before = df.shape
df = df.drop(index=dups)
print("Before dropping duplicated:", shape_before)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Convert track length from milliseconds to minutes and rename the column to
# match.  The cell cannot be re-run as-is: "duration_ms" no longer exists afterwards.
MS_PER_MINUTE = 60_000
df = (
    df.assign(duration_ms=lambda frame: frame["duration_ms"] / MS_PER_MINUTE)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Discard the two columns that the rest of the notebook does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a 50-bin histogram of ``df[col]``;
    the right panel is a box plot of the same values with dotted horizontal
    marker lines and arrow annotations at the mean and at the median.  The
    figure is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : table indexable by column label
        ``df[col]`` must provide ``.mean()`` and ``.median()``.
    col : str
        Label of the column to plot; also used in the title, the axis
        titles and the trace names.
    """
    values = df[col]

    # (statistic, line colour, annotation x position, annotation label)
    reference_stats = (
        (values.mean(), "#A6E3A1", -0.7, "Mean"),
        (values.median(), "#CBA6F7", 0.7, "Median"),
    )

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=values,
        name=f"Histogram {col}",
        marker={"color": "#EBA0AC"},
        nbinsx=50,
    )
    fig.add_trace(histogram, row=1, col=1)

    # Marker lines and labels are attached to the box-plot panel before the
    # box trace itself is added, as in the original call order.
    box_panel = {"row": 1, "col": 2}
    for stat, colour, label_x, label in reference_stats:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=stat,
            y1=stat,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_panel,
        )
        fig.add_annotation(
            xref="paper",
            x=label_x,
            y=stat,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {stat:.2f}",
            **box_panel,
        )

    box = go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(box, **box_panel)

    # Horizontal legend, right-aligned just above the plotting area.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric features to profile; fun_subplots_plotly is called once per feature.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

In [1]:
# Leftover scratch cell: prints the literal 1 and does not touch any analysis state.
print(1)

1


### Header 3.2


In [None]:
# Load the track table from data.csv; "release_date" is parsed as a date on read.
df = pd.read_csv(filepath_or_buffer="data.csv", parse_dates=["release_date"])

df.head()

In [None]:
# Note names in chromatic order; a name's list position is the integer that
# map_key translates it from (0 -> "C", ..., 11 -> "B").
pitch_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
map_key = dict(enumerate(pitch_names))

map_mode = {0: "Minor", 1: "Major"}

# Replace the integer codes with readable labels.  Values missing from a
# mapping become NaN, so running this cell twice blanks both columns.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Flag rows whose (artists, name) pair repeats an earlier row; `dups` holds the
# index labels of those later repeats (the first occurrence is not flagged).
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat.to_numpy()]

In [None]:
# Remove the rows listed in `dups`, reporting the table shape before and after.
shape_before = df.shape
df = df.drop(index=dups)
print("Before dropping duplicated:", shape_before)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Convert track length from milliseconds to minutes and rename the column to
# match.  The cell cannot be re-run as-is: "duration_ms" no longer exists afterwards.
MS_PER_MINUTE = 60_000
df = (
    df.assign(duration_ms=lambda frame: frame["duration_ms"] / MS_PER_MINUTE)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Discard the two columns that the rest of the notebook does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a 50-bin histogram of ``df[col]``;
    the right panel is a box plot of the same values with dotted horizontal
    marker lines and arrow annotations at the mean and at the median.  The
    figure is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : table indexable by column label
        ``df[col]`` must provide ``.mean()`` and ``.median()``.
    col : str
        Label of the column to plot; also used in the title, the axis
        titles and the trace names.
    """
    values = df[col]

    # (statistic, line colour, annotation x position, annotation label)
    reference_stats = (
        (values.mean(), "#A6E3A1", -0.7, "Mean"),
        (values.median(), "#CBA6F7", 0.7, "Median"),
    )

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=values,
        name=f"Histogram {col}",
        marker={"color": "#EBA0AC"},
        nbinsx=50,
    )
    fig.add_trace(histogram, row=1, col=1)

    # Marker lines and labels are attached to the box-plot panel before the
    # box trace itself is added, as in the original call order.
    box_panel = {"row": 1, "col": 2}
    for stat, colour, label_x, label in reference_stats:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=stat,
            y1=stat,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_panel,
        )
        fig.add_annotation(
            xref="paper",
            x=label_x,
            y=stat,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {stat:.2f}",
            **box_panel,
        )

    box = go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(box, **box_panel)

    # Horizontal legend, right-aligned just above the plotting area.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric features to profile; fun_subplots_plotly is called once per feature.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# IPython magics: inline matplotlib figures, rendered in 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: no column limit, precision 2, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure title size 18.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# Apply the 'ggplot' style, then 'mocha' on top of it.
# NOTE(review): 'mocha' is presumably registered by the `mplcatppuccin`
# import at the top of this cell — confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table from data.csv; "release_date" is parsed as a date on read.
df = pd.read_csv(filepath_or_buffer="data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Note names in chromatic order; a name's list position is the integer that
# map_key translates it from (0 -> "C", ..., 11 -> "B").
pitch_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
map_key = dict(enumerate(pitch_names))

map_mode = {0: "Minor", 1: "Major"}

# Replace the integer codes with readable labels.  Values missing from a
# mapping become NaN, so running this cell twice blanks both columns.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Flag rows whose (artists, name) pair repeats an earlier row; `dups` holds the
# index labels of those later repeats (the first occurrence is not flagged).
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat.to_numpy()]

In [None]:
# Remove the rows listed in `dups`, reporting the table shape before and after.
shape_before = df.shape
df = df.drop(index=dups)
print("Before dropping duplicated:", shape_before)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Convert track length from milliseconds to minutes and rename the column to
# match.  The cell cannot be re-run as-is: "duration_ms" no longer exists afterwards.
MS_PER_MINUTE = 60_000
df = (
    df.assign(duration_ms=lambda frame: frame["duration_ms"] / MS_PER_MINUTE)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Discard the two columns that the rest of the notebook does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a 50-bin histogram of ``df[col]``;
    the right panel is a box plot of the same values with dotted horizontal
    marker lines and arrow annotations at the mean and at the median.  The
    figure is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : table indexable by column label
        ``df[col]`` must provide ``.mean()`` and ``.median()``.
    col : str
        Label of the column to plot; also used in the title, the axis
        titles and the trace names.
    """
    values = df[col]

    # (statistic, line colour, annotation x position, annotation label)
    reference_stats = (
        (values.mean(), "#A6E3A1", -0.7, "Mean"),
        (values.median(), "#CBA6F7", 0.7, "Median"),
    )

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=values,
        name=f"Histogram {col}",
        marker={"color": "#EBA0AC"},
        nbinsx=50,
    )
    fig.add_trace(histogram, row=1, col=1)

    # Marker lines and labels are attached to the box-plot panel before the
    # box trace itself is added, as in the original call order.
    box_panel = {"row": 1, "col": 2}
    for stat, colour, label_x, label in reference_stats:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=stat,
            y1=stat,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_panel,
        )
        fig.add_annotation(
            xref="paper",
            x=label_x,
            y=stat,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {stat:.2f}",
            **box_panel,
        )

    box = go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(box, **box_panel)

    # Horizontal legend, right-aligned just above the plotting area.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric features to profile; fun_subplots_plotly is called once per feature.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Assign every track to its calendar decade.
#
# The bins are closed on the LEFT: [1920, 1930) -> "1920s", ...,
# [2020, 2030) -> "2020s".  With right-closed bins a year ending in 0 was
# filed under the previous decade (1930 -> "1920s", 2020 -> "2010s") and
# 1920 itself fell outside every bin.
decade_breaks = list(range(1920, 2031, 10))
bins = pd.IntervalIndex.from_breaks(decade_breaks, closed="left")

# Interval text (as produced by astype("str") below) -> decade label.
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

# Years outside 1920-2029 match no bin and end up as NaN.
df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Number of tracks per decade, ordered by decade label.
# rename_axis / reset_index(name=...) pin the column names to "decade" and
# "count", which the chart below relies on; a bare reset_index() only yields
# those names on pandas >= 2.0.
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

# Horizontal bars, one per decade, coloured by their own count.
fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    # Horizontal colour bar with fixed pixel size, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this chart.
del decade_counts

In [None]:
# Keep only tracks that last at most five minutes (the bound is inclusive).
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade after the duration filter, ordered by decade.
# rename_axis / reset_index(name=...) pin the column names to "decade" and
# "count", which the chart below relies on; a bare reset_index() only yields
# those names on pandas >= 2.0.
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

# Horizontal bars, one per decade, coloured by their own count.
fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade (< 5min)", "font": {"size": 24}},
    # Horizontal colour bar with fixed pixel size, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this chart.
del decade_counts

In [None]:
# Features whose distribution is broken down by decade.  `cols` stays a
# notebook-level name: the per-decade mean/median cells below read it.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# Layout settings shared by every histogram; only the title changes.
stacked_layout = {
    "bargap": 0.1,
    "bargroupgap": 0.05,
    "barmode": "stack",
    "legend": {"title": "Decade"},
}

for feature in cols:
    fig = px.histogram(
        df,
        x=feature,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {feature} per decade", "font": {"size": 24}},
        **stacked_layout,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols` (defined in an earlier cell) per decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# The row count follows the number of features (4 rows for the 11 in `cols`),
# so the cell keeps working if `cols` is shortened or extended.
features = list(decade_mean.columns)
grid_cols = 3
grid_rows = max(1, -(-len(features) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=features,
    vertical_spacing=0.1,
)

for position, feature in enumerate(features):
    panel_row, panel_col = divmod(position, grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend anchored in the lower-right area of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_mean

In [None]:
# Median of every feature in `cols` (defined in an earlier cell) per decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# The row count follows the number of features (4 rows for the 11 in `cols`),
# so the cell keeps working if `cols` is shortened or extended.
features = list(decade_median.columns)
grid_cols = 3
grid_rows = max(1, -(-len(features) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=features,
    vertical_spacing=0.1,
)

for position, feature in enumerate(features):
    panel_row, panel_col = divmod(position, grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend anchored in the lower-right area of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_median

In [None]:
# One chart per categorical attribute: (column, chart title, bar mode).
# The charts are shown in this order: explicit, mode, key.
chart_specs = [
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
]

for category, chart_title, bar_mode in chart_specs:
    # Tracks per (decade, category) pair; "year" is only the column being counted.
    pair_counts = (
        df.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )

    fig = px.histogram(
        data_frame=pair_counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Drop the loop temporaries so they do not linger in the kernel namespace.
del category, chart_title, bar_mode, pair_counts

In [None]:
# Features plotted against popularity, one scatter chart each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, translucent points shared by every chart.
point_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for feature in cols:
    fig = px.scatter(
        df,
        x=feature,
        y="popularity",
        trendline="ols",  # adds an OLS trend line to each chart
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=point_style)
    fig.update_layout(
        title={"text": f"{feature} vs popularity", "font": {"size": 24}}
    )
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed against popularity (popularity on x, the feature on y).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# 3x4 grid; only the first 11 slots get an axes, the last slot stays empty.
grid_fig = plt.figure(figsize=(15, 10))

for slot, feature in enumerate(cols, start=1):
    ax = grid_fig.add_subplot(3, 4, slot)
    sns.regplot(
        data=df,
        x="popularity",
        y=feature,
        ax=ax,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
    )

grid_fig.tight_layout()
plt.show()

In [None]:
# Numeric columns included in the correlation matrix.
cols = [
    "valence",
    "acousticness",
    "danceability",
    "duration_min",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "year",
    "speechiness",
    "tempo",
    "popularity",
]

# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = df[cols].corr()

# Hide the upper triangle including the diagonal; it mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

plt.figure(figsize=(10, 7))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    mask=mask,
)
plt.grid(visible=False)  # turn the style's grid lines off for this figure
plt.show()

In [None]:
# Derive "first_artist": the text before the first comma of the "artists"
# value, with square brackets and single quotes removed.
# NOTE(review): presumably "artists" holds a stringified list such as
# "['A', 'B']" — confirm against data.csv.
#
# The earlier version first cut the string at the first letter "x", which
# truncated every name containing an "x"; that step is removed.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)

# "artists" is not used after this point.  The cell cannot be re-run as-is
# because the source column no longer exists.
df = df.drop("artists", axis=1)

In [None]:
# Popularity summary per first artist, best mean first.
# The aggregations are passed as a list (a set has no defined order, so the
# column order used to vary between runs), and the group key is restored with
# reset_index() so "first_artist" is a column on every pandas version.
popularity_check = (
    df.groupby("first_artist")["popularity"]
    .agg(["mean", "count", "median"])
    .reset_index()
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these are rank positions after the sort above, so the rows
# they hit change whenever the data or an upstream filter changes — confirm
# which artists were meant and filter by name instead.  errors="ignore"
# keeps the cell from raising KeyError when a position does not exist.
popularity_check = popularity_check.drop([16091, 16086, 16429], errors="ignore")

# Keep artists with more than 45 tracks, then the 50 with the highest mean.
popularity_check = popularity_check[popularity_check["count"] > 45]
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track of each artist listed in `popularity_check`, keeping
# the artists in that table's order.  The pieces are concatenated once,
# instead of growing a DataFrame inside the loop (quadratic copying).
artist_tracks = [
    df[df["first_artist"] == artist]
    for artist in popularity_check["first_artist"].unique()
]

if artist_tracks:
    top_50_artists = pd.concat(artist_tracks, axis=0).reset_index(drop=True)
else:
    # No artist qualified: fall back to an empty frame, as before.
    top_50_artists = pd.DataFrame()

top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per artist in top_50_artists, smallest first.
plt.figure(figsize=(12, 8))
tracks_per_artist = top_50_artists["first_artist"].value_counts()
tracks_per_artist.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of top_50_artists tracks per decade, smallest first.
tracks_per_decade = top_50_artists["decade"].value_counts()
tracks_per_decade.sort_values(ascending=True).plot.barh()

In [None]:
# Features whose density is drawn per artist (one curve per first_artist).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# 3x4 grid; only the first 11 slots get an axes, the last slot stays empty.
grid_fig = plt.figure(figsize=(15, 10))

for slot, feature in enumerate(cols, start=1):
    ax = grid_fig.add_subplot(3, 4, slot)
    # The legend is switched off for every panel.
    sns.kdeplot(
        data=top_50_artists,
        x=feature,
        hue="first_artist",
        legend=False,
        ax=ax,
    )

grid_fig.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries — confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# IPython magics: inline matplotlib figures, rendered in 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: no column limit, precision 2, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure title size 18.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# Apply the 'ggplot' style, then 'mocha' on top of it.
# NOTE(review): 'mocha' is presumably registered by the `mplcatppuccin`
# import at the top of this cell — confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table from data.csv; "release_date" is parsed as a date on read.
df = pd.read_csv(filepath_or_buffer="data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Note names in chromatic order; a name's list position is the integer that
# map_key translates it from (0 -> "C", ..., 11 -> "B").
pitch_names = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
map_key = dict(enumerate(pitch_names))

map_mode = {0: "Minor", 1: "Major"}

# Replace the integer codes with readable labels.  Values missing from a
# mapping become NaN, so running this cell twice blanks both columns.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Flag rows whose (artists, name) pair repeats an earlier row; `dups` holds the
# index labels of those later repeats (the first occurrence is not flagged).
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat.to_numpy()]

In [None]:
# Remove the rows listed in `dups`, reporting the table shape before and after.
shape_before = df.shape
df = df.drop(index=dups)
print("Before dropping duplicated:", shape_before)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Convert track length from milliseconds to minutes and rename the column to
# match.  The cell cannot be re-run as-is: "duration_ms" no longer exists afterwards.
MS_PER_MINUTE = 60_000
df = (
    df.assign(duration_ms=lambda frame: frame["duration_ms"] / MS_PER_MINUTE)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Discard the two columns that the rest of the notebook does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a 50-bin histogram of ``df[col]``;
    the right panel is a box plot of the same values with dotted horizontal
    marker lines and arrow annotations at the mean and at the median.  The
    figure is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : table indexable by column label
        ``df[col]`` must provide ``.mean()`` and ``.median()``.
    col : str
        Label of the column to plot; also used in the title, the axis
        titles and the trace names.
    """
    values = df[col]

    # (statistic, line colour, annotation x position, annotation label)
    reference_stats = (
        (values.mean(), "#A6E3A1", -0.7, "Mean"),
        (values.median(), "#CBA6F7", 0.7, "Median"),
    )

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=values,
        name=f"Histogram {col}",
        marker={"color": "#EBA0AC"},
        nbinsx=50,
    )
    fig.add_trace(histogram, row=1, col=1)

    # Marker lines and labels are attached to the box-plot panel before the
    # box trace itself is added, as in the original call order.
    box_panel = {"row": 1, "col": 2}
    for stat, colour, label_x, label in reference_stats:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=stat,
            y1=stat,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_panel,
        )
        fig.add_annotation(
            xref="paper",
            x=label_x,
            y=stat,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {stat:.2f}",
            **box_panel,
        )

    box = go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(box, **box_panel)

    # Horizontal legend, right-aligned just above the plotting area.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric features to profile; fun_subplots_plotly is called once per feature.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Assign every track to its calendar decade.
#
# The bins are closed on the LEFT: [1920, 1930) -> "1920s", ...,
# [2020, 2030) -> "2020s".  With right-closed bins a year ending in 0 was
# filed under the previous decade (1930 -> "1920s", 2020 -> "2010s") and
# 1920 itself fell outside every bin.
decade_breaks = list(range(1920, 2031, 10))
bins = pd.IntervalIndex.from_breaks(decade_breaks, closed="left")

# Interval text (as produced by astype("str") below) -> decade label.
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

# Years outside 1920-2029 match no bin and end up as NaN.
df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Number of tracks per decade, ordered by decade label.
# rename_axis / reset_index(name=...) pin the column names to "decade" and
# "count", which the chart below relies on; a bare reset_index() only yields
# those names on pandas >= 2.0.
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

# Horizontal bars, one per decade, coloured by their own count.
fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    # Horizontal colour bar with fixed pixel size, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this chart.
del decade_counts

In [None]:
# Keep only tracks that last at most five minutes (the bound is inclusive).
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade after the duration filter, ordered by decade.
# rename_axis / reset_index(name=...) pin the column names to "decade" and
# "count", which the chart below relies on; a bare reset_index() only yields
# those names on pandas >= 2.0.
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

# Horizontal bars, one per decade, coloured by their own count.
fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade (< 5min)", "font": {"size": 24}},
    # Horizontal colour bar with fixed pixel size, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this chart.
del decade_counts

In [None]:
# Features whose distribution is broken down by decade.  `cols` stays a
# notebook-level name: the per-decade mean/median cells below read it.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# Layout settings shared by every histogram; only the title changes.
stacked_layout = {
    "bargap": 0.1,
    "bargroupgap": 0.05,
    "barmode": "stack",
    "legend": {"title": "Decade"},
}

for feature in cols:
    fig = px.histogram(
        df,
        x=feature,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {feature} per decade", "font": {"size": 24}},
        **stacked_layout,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols` (defined in an earlier cell) per decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# The row count follows the number of features (4 rows for the 11 in `cols`),
# so the cell keeps working if `cols` is shortened or extended.
features = list(decade_mean.columns)
grid_cols = 3
grid_rows = max(1, -(-len(features) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=features,
    vertical_spacing=0.1,
)

for position, feature in enumerate(features):
    panel_row, panel_col = divmod(position, grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend anchored in the lower-right area of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_mean

In [None]:
# Median of every feature in `cols` (defined in an earlier cell) per decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# The row count follows the number of features (4 rows for the 11 in `cols`),
# so the cell keeps working if `cols` is shortened or extended.
features = list(decade_median.columns)
grid_cols = 3
grid_rows = max(1, -(-len(features) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=features,
    vertical_spacing=0.1,
)

for position, feature in enumerate(features):
    panel_row, panel_col = divmod(position, grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend anchored in the lower-right area of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_median

In [None]:
# Number of tracks per (decade, category) for each categorical feature.
# The "year" column is only used as something to count.
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# (counts table, colour column, figure title, bar mode), in display order.
charts = (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
)

for counts, category, title_text, bar_mode in charts:
    fig = px.histogram(
        data_frame=counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": title_text, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the temporary tables (and the references to them held by the loop).
del (mode_count, key_count, explicit_count, charts, counts)

In [None]:
# Features plotted on the x axis, one scatter figure per feature.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, translucent points so dense regions stay readable.
marker_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    # Popularity against the feature, with an OLS trendline drawn by plotly express.
    fig = px.scatter(
        df, x=col, y="popularity", trendline="ols", trendline_color_override="#A6E3A1"
    )
    fig.update_traces(marker=marker_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features plotted on the y axis, one panel per feature.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

point_style = {"alpha": 0.5}
fit_line_style = {"color": "#F38BA8"}

# 3 x 4 grid; panels are numbered from 1, the last slot stays empty.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.regplot(
        data=df,
        x="popularity",
        y=col,
        ax=ax,
        scatter_kws=point_style,
        line_kws=fit_line_style,
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

plt.figure(figsize=(10, 7))

corr_matrix = df[cols].corr()
# Hide the upper triangle (diagonal included); only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_matrix))

sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="Blues", mask=mask)
plt.grid(visible=False)

In [None]:
# Extract the first listed artist of every track.
# NOTE(review): "artists" is presumably the string form of a list such as
# "['A', 'B']" — confirm against data.csv.
# The previous version first cut the string at the first letter "x", which
# truncated every name containing an "x"; that step is removed.
# Names that themselves contain a comma are still cut at that comma.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)  # strip brackets and single quotes
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary (mean, count, median), highest mean first.
# The aggregations are passed as a list rather than a set so the output
# column order is deterministic.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these row labels come from the sort/reset_index above, so they only
# identify the intended rows for one exact dataset and sort order (and raise KeyError
# when absent). Confirm which artists were meant and filter them by name instead.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then the 50 with the highest mean popularity.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# All tracks of the artists listed in popularity_check, grouped artist by artist
# in popularity_check order. The per-artist slices are collected first and
# concatenated once, instead of growing a DataFrame inside the loop.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

# pd.concat raises on an empty list; fall back to an empty frame as before.
top_50_artists = (
    pd.concat(artist_frames, axis=0).reset_index(drop=True)
    if artist_frames
    else pd.DataFrame()
)
top_50_artists.head()

In [None]:
# Tracks per artist in the selection, smallest first so the longest bar is on top.
artist_track_counts = (
    top_50_artists["first_artist"].value_counts().sort_values(ascending=True)
)

plt.figure(figsize=(12, 8))
artist_track_counts.plot(kind="barh", ax=plt.gca())
plt.show()

In [None]:
# Tracks per decade within the selected artists, smallest count first.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot(kind="barh")

In [None]:
# Features whose per-artist densities are drawn, one panel per feature.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# 3 x 4 grid; one density curve per artist, legend suppressed.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.kdeplot(data=top_50_artists, x=col, hue="first_artist", legend=False, ax=ax)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: no column limit, precision 2, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure title size 18.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# NOTE(review): the 'mocha' style is presumably registered by importing mplcatppuccin — confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table from data.csv (path relative to the working directory);
# "release_date" is parsed as a date column while reading.
df = pd.read_csv(
 "data.csv",
 parse_dates=["release_date"],
)
# Preview the first rows.
df.head()

In [None]:
# Integer key code (0-11) -> note name.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)

# Integer mode flag -> scale name.
map_mode = {0: "Minor", 1: "Major"}

# Replace the numeric codes by their labels; codes missing from a mapping become NaN.
# Running this twice would therefore turn both columns into NaN.
df = df.assign(mode=df["mode"].map(map_mode), key=df["key"].map(map_key))

In [None]:
# Index labels of rows whose (artists, name) pair already appeared earlier in the table.
check_dups = df.loc[:, ["artists", "name"]]
dups = check_dups.index[check_dups.duplicated()]

In [None]:
# Remove the duplicated rows found above, reporting the shape before and after.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Divide the duration by 60_000 (milliseconds -> minutes) and rename the column to match.
df = df.assign(duration_ms=df["duration_ms"] / 60_000).rename(
    columns={"duration_ms": "duration_min"}
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# These two columns are not used by the rest of the analysis.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a 50-bin histogram of ``df[col]``.
    The right panel (25%) is a box plot of the same column, with dotted
    horizontal lines and arrow annotations at the mean and at the median.

    Parameters
    ----------
    df : table holding the data; read as ``df[col]``.
    col : column label, also used in trace names, axis titles and the figure title.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    values = df[col]
    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    fig.add_trace(
        go.Histogram(
            x=values, name=f"Histogram {col}", marker={"color": "#EBA0AC"}, nbinsx=50
        ),
        row=1,
        col=1,
    )

    # (label, statistic, line colour, x position of the annotation)
    reference_stats = (
        ("Mean", values.mean(), "#A6E3A1", -0.7),
        ("Median", values.median(), "#CBA6F7", 0.7),
    )
    for label, stat, line_color, text_x in reference_stats:
        # Dotted horizontal marker across the box-plot panel.
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=stat,
            y1=stat,
            row=1,
            col=2,
            line={"color": line_color, "width": 3, "dash": "dot"},
        )
        fig.add_annotation(
            xref="paper",
            x=text_x,
            y=stat,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {stat:.2f}",
            row=1,
            col=2,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        row=1,
        col=2,
    )

    # Horizontal legend placed just above the plotting area, right-aligned.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric features to get a distribution figure each.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Assign every track to its decade.
# Bins are closed on the LEFT, so 1930 falls in "[1930, 1940)" -> "1930s".
# The previous right-closed bins labelled 1930 as "1920s" (and likewise at every
# decade boundary), left year 1920 unlabelled and counted 2020 as "2010s".
# The extra 2030 break gives year 2020 its own "2020s" bin.
bins = pd.IntervalIndex.from_breaks(
    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],
    closed="left",
)

# Interval text ("[1920, 1930)") -> decade label ("1920s"), derived from the bins
# themselves so the two can never drift apart.
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

# Years outside every bin become NaN.
df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Tracks per decade, ordered by decade label for the y axis.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade", ascending=True)

# Horizontal colour bar, fixed pixel size, placed above the plot.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Keep only tracks lasting at most 5 minutes.
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade after the duration filter, ordered by decade label.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade", ascending=True)

# Horizontal colour bar, fixed pixel size, placed above the plot.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade (< 5min)", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Features to draw; this list is also read by the per-decade summary cells below.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in cols:
    # One stacked histogram per feature, coloured by decade.
    fig = px.histogram(
        df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
        legend={"title": "Decade"},
        barmode="stack",
        bargap=0.1,
        bargroupgap=0.05,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols` for each decade (one row per decade).
# NOTE(review): `cols` is whatever an earlier cell left behind — confirm it is the
# intended feature list when cells are run out of order.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal bar chart per feature on a 3-column grid. The number of rows is
# derived from the number of features instead of being hard-coded, so changing
# `cols` no longer raises IndexError.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_mean.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_mean.columns):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(position, n_grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every feature in `cols` for each decade (one row per decade).
# NOTE(review): `cols` is whatever an earlier cell left behind — confirm it is the
# intended feature list when cells are run out of order.
decade_median = df.groupby("decade")[cols].median()

# One horizontal bar chart per feature on a 3-column grid. The number of rows is
# derived from the number of features instead of being hard-coded, so changing
# `cols` no longer raises IndexError.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_median.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_median.columns):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(position, n_grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# Number of tracks per (decade, category) for each categorical feature.
# The "year" column is only used as something to count.
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# (counts table, colour column, figure title, bar mode), in display order.
charts = (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
)

for counts, category, title_text, bar_mode in charts:
    fig = px.histogram(
        data_frame=counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": title_text, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the temporary tables (and the references to them held by the loop).
del (mode_count, key_count, explicit_count, charts, counts)

In [None]:
# Features plotted on the x axis, one scatter figure per feature.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, translucent points so dense regions stay readable.
marker_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    # Popularity against the feature, with an OLS trendline drawn by plotly express.
    fig = px.scatter(
        df, x=col, y="popularity", trendline="ols", trendline_color_override="#A6E3A1"
    )
    fig.update_traces(marker=marker_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features plotted on the y axis, one panel per feature.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

point_style = {"alpha": 0.5}
fit_line_style = {"color": "#F38BA8"}

# 3 x 4 grid; panels are numbered from 1, the last slot stays empty.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.regplot(
        data=df,
        x="popularity",
        y=col,
        ax=ax,
        scatter_kws=point_style,
        line_kws=fit_line_style,
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

plt.figure(figsize=(10, 7))

corr_matrix = df[cols].corr()
# Hide the upper triangle (diagonal included); only the lower triangle is drawn.
mask = np.triu(np.ones_like(corr_matrix))

sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="Blues", mask=mask)
plt.grid(visible=False)

In [None]:
# Extract the first listed artist of every track.
# NOTE(review): "artists" is presumably the string form of a list such as
# "['A', 'B']" — confirm against data.csv.
# The previous version first cut the string at the first letter "x", which
# truncated every name containing an "x"; that step is removed.
# Names that themselves contain a comma are still cut at that comma.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)  # strip brackets and single quotes
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary (mean, count, median), highest mean first.
# The aggregations are passed as a list rather than a set so the output
# column order is deterministic.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these row labels come from the sort/reset_index above, so they only
# identify the intended rows for one exact dataset and sort order (and raise KeyError
# when absent). Confirm which artists were meant and filter them by name instead.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then the 50 with the highest mean popularity.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# All tracks of the artists listed in popularity_check, grouped artist by artist
# in popularity_check order. The per-artist slices are collected first and
# concatenated once, instead of growing a DataFrame inside the loop.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

# pd.concat raises on an empty list; fall back to an empty frame as before.
top_50_artists = (
    pd.concat(artist_frames, axis=0).reset_index(drop=True)
    if artist_frames
    else pd.DataFrame()
)
top_50_artists.head()

In [None]:
# Tracks per artist in the selection, smallest first so the longest bar is on top.
artist_track_counts = (
    top_50_artists["first_artist"].value_counts().sort_values(ascending=True)
)

plt.figure(figsize=(12, 8))
artist_track_counts.plot(kind="barh", ax=plt.gca())
plt.show()

In [None]:
# Tracks per decade within the selected artists, smallest count first.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot(kind="barh")

In [None]:
# Features whose per-artist densities are drawn, one panel per feature.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# 3 x 4 grid; one density curve per artist, legend suppressed.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.kdeplot(data=top_50_artists, x=col, hue="first_artist", legend=False, ax=ax)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: no column limit, precision 2, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure title size 18.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# NOTE(review): the 'mocha' style is presumably registered by importing mplcatppuccin — confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table from data.csv (path relative to the working directory);
# "release_date" is parsed as a date column while reading.
df = pd.read_csv(
 "data.csv",
 parse_dates=["release_date"],
)
# Preview the first rows.
df.head()

In [None]:
# Integer key code (0-11) -> note name.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)

# Integer mode flag -> scale name.
map_mode = {0: "Minor", 1: "Major"}

# Replace the numeric codes by their labels; codes missing from a mapping become NaN.
# Running this twice would therefore turn both columns into NaN.
df = df.assign(mode=df["mode"].map(map_mode), key=df["key"].map(map_key))

In [None]:
# Index labels of rows whose (artists, name) pair already appeared earlier in the table.
check_dups = df.loc[:, ["artists", "name"]]
dups = check_dups.index[check_dups.duplicated()]

In [None]:
# Remove the duplicated rows found above, reporting the shape before and after.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Divide the duration by 60_000 (milliseconds -> minutes) and rename the column to match.
df = df.assign(duration_ms=df["duration_ms"] / 60_000).rename(
    columns={"duration_ms": "duration_min"}
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# These two columns are not used by the rest of the analysis.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a 50-bin histogram of ``df[col]``.
    The right panel (25%) is a box plot of the same column, with dotted
    horizontal lines and arrow annotations at the mean and at the median.

    Parameters
    ----------
    df : table holding the data; read as ``df[col]``.
    col : column label, also used in trace names, axis titles and the figure title.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    values = df[col]
    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    fig.add_trace(
        go.Histogram(
            x=values, name=f"Histogram {col}", marker={"color": "#EBA0AC"}, nbinsx=50
        ),
        row=1,
        col=1,
    )

    # (label, statistic, line colour, x position of the annotation)
    reference_stats = (
        ("Mean", values.mean(), "#A6E3A1", -0.7),
        ("Median", values.median(), "#CBA6F7", 0.7),
    )
    for label, stat, line_color, text_x in reference_stats:
        # Dotted horizontal marker across the box-plot panel.
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=stat,
            y1=stat,
            row=1,
            col=2,
            line={"color": line_color, "width": 3, "dash": "dot"},
        )
        fig.add_annotation(
            xref="paper",
            x=text_x,
            y=stat,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {stat:.2f}",
            row=1,
            col=2,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        row=1,
        col=2,
    )

    # Horizontal legend placed just above the plotting area, right-aligned.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric features to get a distribution figure each.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Assign every track to its decade.
# Bins are closed on the LEFT, so 1930 falls in "[1930, 1940)" -> "1930s".
# The previous right-closed bins labelled 1930 as "1920s" (and likewise at every
# decade boundary), left year 1920 unlabelled and counted 2020 as "2010s".
# The extra 2030 break gives year 2020 its own "2020s" bin.
bins = pd.IntervalIndex.from_breaks(
    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],
    closed="left",
)

# Interval text ("[1920, 1930)") -> decade label ("1920s"), derived from the bins
# themselves so the two can never drift apart.
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

# Years outside every bin become NaN.
df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Tracks per decade, ordered by decade label for the y axis.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade", ascending=True)

# Horizontal colour bar, fixed pixel size, placed above the plot.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Keep only tracks lasting at most 5 minutes.
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade after the duration filter, ordered by decade label.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade", ascending=True)

# Horizontal colour bar, fixed pixel size, placed above the plot.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade (< 5min)", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Features to draw; this list is also read by the per-decade summary cells below.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in cols:
    # One stacked histogram per feature, coloured by decade.
    fig = px.histogram(
        df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
        legend={"title": "Decade"},
        barmode="stack",
        bargap=0.1,
        bargroupgap=0.05,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols` for each decade (one row per decade).
# NOTE(review): `cols` is whatever an earlier cell left behind — confirm it is the
# intended feature list when cells are run out of order.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal bar chart per feature on a 3-column grid. The number of rows is
# derived from the number of features instead of being hard-coded, so changing
# `cols` no longer raises IndexError.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_mean.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_mean.columns):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(position, n_grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every feature in `cols` for each decade (one row per decade).
# NOTE(review): `cols` is whatever an earlier cell left behind — confirm it is the
# intended feature list when cells are run out of order.
decade_median = df.groupby("decade")[cols].median()

# One horizontal bar chart per feature on a 3-column grid. The number of rows is
# derived from the number of features instead of being hard-coded, so changing
# `cols` no longer raises IndexError.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_median.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_median.columns):
    # Fill the grid row by row; plotly rows and columns are 1-based.
    grid_row, grid_col = divmod(position, n_grid_cols)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# Number of tracks per (decade, category) for each categorical feature.
# The "year" column is only used as something to count.
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# (counts table, colour column, figure title, bar mode), in display order.
charts = (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
)

for counts, category, title_text, bar_mode in charts:
    fig = px.histogram(
        data_frame=counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": title_text, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the temporary tables (and the references to them held by the loop).
del (mode_count, key_count, explicit_count, charts, counts)

In [None]:
# Numeric features to plot against popularity, one scatter figure each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, translucent points so dense regions stay readable.
point_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    fig = px.scatter(
        df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=point_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed against popularity, one panel each on a 3 x 4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# plt.subplot numbers panels from 1, hence start=1.
for position, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, position)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
        ax=ax,
    )

plt.tight_layout()
plt.show()

In [None]:
# Features (plus popularity) whose pairwise correlations are drawn.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Compute the correlation matrix once; it feeds both the mask and the plot.
corr_matrix = df[cols].corr()

# Hide the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(corr_matrix))

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, mask=mask, cmap="Blues", annot=True, fmt=".2f")
plt.grid(visible=False)

In [None]:
# Derive `first_artist` from `artists`, which presumably holds a stringified
# list such as "['A', 'B']" (TODO confirm against data.csv): strip the list
# punctuation, then keep everything before the first comma.
#
# The earlier version first cut each value at the letter "x"
# (`str(x).split("x")[0]`), which truncated every name containing an "x";
# that step is removed here.
# NOTE(review): the hard-coded row labels dropped from `popularity_check`
# further down were presumably chosen against the old, truncated names --
# re-check them after this change.
# Known limitation (unchanged): a name that itself contains a comma is still
# cut at that comma, and only single quotes are stripped.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, highest mean first. The aggregations are
# passed as a list (not a set literal) so the resulting column order is the
# same in every kernel session.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these three row labels are hard-coded positions in the sorted
# table above. Which artists they refer to depends on the data and on how
# `first_artist` was derived -- confirm they still point at the intended rows.
# A missing label raises KeyError, as before.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then the 50 highest means among them.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track by the artists listed in `popularity_check`, keeping the
# artists in that table's order. The per-artist slices are concatenated once;
# growing a DataFrame with pd.concat inside the loop re-copies all accumulated
# rows on every pass.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

# With no qualifying artist, fall back to an empty frame that still has df's
# columns so the plotting cells below can look them up.
top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else df.iloc[0:0]
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, sorted ascending.
plt.figure(figsize=(12, 8))
tracks_per_artist = top_50_artists["first_artist"].value_counts()
tracks_per_artist.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of the selected artists' tracks in each decade.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot.barh()

In [None]:
# Features whose per-artist density curves are drawn, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# One curve per artist in every panel; the legend is suppressed.
for position, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, position)
    sns.kdeplot(
        x=col,
        hue="first_artist",
        data=top_50_artists,
        legend=False,
        ax=ax,
    )

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Label every track with its calendar decade ("1920s" ... "2020s").
#
# The bins are closed on the LEFT, so a decade runs from its first year up to
# (not including) the next decade's first year: 1930 -> "1930s". The earlier
# right-closed bins filed 1930 under "1920s" and 2020 under "2010s", and left
# 1920 without a label. The last edge is 2030 so that 2020 gets its own label.
# Years outside 1920-2029 stay NaN.
bins = pd.IntervalIndex.from_breaks(list(range(1920, 2031, 10)), closed="left")

# Interval text as produced by astype("str") -> decade label.
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Tracks per decade as a frame with "decade" and "count" columns, ordered by
# decade label.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values(by="decade", ascending=True)
)

# Horizontal colour bar placed above the plot area.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count").update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Keep only tracks that run five minutes or less.
df = df[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade (after the duration filter) as a frame with "decade" and
# "count" columns, ordered by decade label.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values(by="decade", ascending=True)
)

# Horizontal colour bar placed above the plot area.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade (< 5min)", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count").update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Features whose distribution is shown as a stacked histogram per decade.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# Layout settings shared by every figure; only the title changes per feature.
histogram_layout = {
    "bargap": 0.1,
    "bargroupgap": 0.05,
    "barmode": "stack",
    "legend": {"title": "Decade"},
}

for col in cols:
    fig = px.histogram(
        df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
        **histogram_layout,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols`, one row per decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# Grid size, titles and traces are all derived from the columns, replacing
# eleven hand-numbered add_trace calls: editing `cols` can no longer leave a
# stale index or a missing panel.
grid_cols = 3
grid_rows = max(1, -(-len(decade_mean.columns) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=list(decade_mean.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // grid_cols + 1,
        col=position % grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every feature in `cols`, one row per decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# Grid size, titles and traces are all derived from the columns, replacing
# eleven hand-numbered add_trace calls: editing `cols` can no longer leave a
# stale index or a missing panel.
grid_cols = 3
grid_rows = max(1, -(-len(decade_median.columns) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=list(decade_median.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // grid_cols + 1,
        col=position % grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# Track counts per decade, split by each categorical attribute. The "year"
# column is only something to count; it is renamed to "count" afterwards.
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# One bar chart per attribute, shown in the order explicit -> mode -> key.
# Only the key chart is stacked; the other two place bars side by side.
for count_frame, hue_col, fig_title, bar_mode in (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
):
    fig = px.histogram(
        data_frame=count_frame,
        x="decade",
        y="count",
        color=hue_col,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": fig_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the temporary frames (and the loop's last reference to one of them).
del (mode_count, key_count, explicit_count, count_frame)

In [None]:
# Numeric features to plot against popularity, one scatter figure each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, translucent points so dense regions stay readable.
point_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    fig = px.scatter(
        df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=point_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed against popularity, one panel each on a 3 x 4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# plt.subplot numbers panels from 1, hence start=1.
for position, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, position)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
        ax=ax,
    )

plt.tight_layout()
plt.show()

In [None]:
# Features (plus popularity) whose pairwise correlations are drawn.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Compute the correlation matrix once; it feeds both the mask and the plot.
corr_matrix = df[cols].corr()

# Hide the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(corr_matrix))

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, mask=mask, cmap="Blues", annot=True, fmt=".2f")
plt.grid(visible=False)

In [None]:
# Derive `first_artist` from `artists`, which presumably holds a stringified
# list such as "['A', 'B']" (TODO confirm against data.csv): strip the list
# punctuation, then keep everything before the first comma.
#
# The earlier version first cut each value at the letter "x"
# (`str(x).split("x")[0]`), which truncated every name containing an "x";
# that step is removed here.
# NOTE(review): the hard-coded row labels dropped from `popularity_check`
# further down were presumably chosen against the old, truncated names --
# re-check them after this change.
# Known limitation (unchanged): a name that itself contains a comma is still
# cut at that comma, and only single quotes are stripped.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, highest mean first. The aggregations are
# passed as a list (not a set literal) so the resulting column order is the
# same in every kernel session.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these three row labels are hard-coded positions in the sorted
# table above. Which artists they refer to depends on the data and on how
# `first_artist` was derived -- confirm they still point at the intended rows.
# A missing label raises KeyError, as before.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then the 50 highest means among them.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track by the artists listed in `popularity_check`, keeping the
# artists in that table's order. The per-artist slices are concatenated once;
# growing a DataFrame with pd.concat inside the loop re-copies all accumulated
# rows on every pass.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

# With no qualifying artist, fall back to an empty frame that still has df's
# columns so the plotting cells below can look them up.
top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else df.iloc[0:0]
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, sorted ascending.
plt.figure(figsize=(12, 8))
tracks_per_artist = top_50_artists["first_artist"].value_counts()
tracks_per_artist.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of the selected artists' tracks in each decade.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot.barh()

In [None]:
# Features whose per-artist density curves are drawn, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# One curve per artist in every panel; the legend is suppressed.
for position, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, position)
    sns.kdeplot(
        x=col,
        hue="first_artist",
        data=top_50_artists,
        legend=False,
        ax=ax,
    )

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

warnings.filterwarnings('ignore')

pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table; `release_date` is parsed as dates while reading.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Integer key code -> note name, built from the names in order (0 = "C").
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)

# Integer mode flag -> mode name.
map_mode = {0: "Minor", 1: "Major"}

# Replace the numeric codes with their names; a value missing from the
# mapping becomes NaN.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Row labels of every repeated (artists, name) pair; with the default
# `duplicated()` settings the first occurrence of each pair is not flagged.
check_dups = df.loc[:, ["artists", "name"]]
dups = check_dups.loc[check_dups.duplicated()].index

In [None]:
# Report the frame's shape on either side of removing the duplicate rows.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Convert track length from milliseconds to minutes (60_000 ms per minute),
# then rename the column to match its new unit.
df["duration_ms"] = df["duration_ms"].div(60_000)
df = df.rename({"duration_ms": "duration_min"}, axis="columns")
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Remove the `release_date` and `id` columns.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The histogram (50 bins requested) takes the wider left panel. The box
    plot sits in the right panel together with dotted lines and arrow
    annotations at the column's mean and median. The figure is displayed
    with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : DataFrame-like
        Table holding the data; only ``df[col]`` is read.
    col : str
        Column to plot. ``df[col]`` must provide ``.mean()`` and ``.median()``.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()

    # (line colour, level, annotation x, annotation text) for each statistic
    stat_marks = (
        ("#A6E3A1", mean_val, -0.7, f"Mean = {mean_val:.2f}"),
        ("#CBA6F7", median_val, 0.7, f"Median = {median_val:.2f}"),
    )
    # Everything except the histogram is placed in the right-hand panel.
    box_cell = {"row": 1, "col": 2}

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    fig.add_trace(
        go.Histogram(
            x=values,
            nbinsx=50,
            name=f"Histogram {col}",
            marker={"color": "#EBA0AC"},
        ),
        row=1,
        col=1,
    )

    # Dotted horizontal lines at the mean and the median.
    for line_color, level, _, _ in stat_marks:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": line_color, "width": 3, "dash": "dot"},
            **box_cell,
        )

    # Arrow labels for the two lines, placed left and right of the box.
    for _, level, arrow_x, label in stat_marks:
        fig.add_annotation(
            xref="paper",
            x=arrow_x,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=label,
            **box_cell,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        **box_cell,
    )

    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
    )

    # Axis titles: histogram panel first, then the box-plot panel.
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", **box_cell)
    fig.update_yaxes(title_text=f"{col}", **box_cell)
    fig.show()

In [None]:
# Numeric features that each get a histogram + box-plot figure.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in music_vals:
    fun_subplots_plotly(df=df, col=col)

# Header 1.2

## Header 2.2

### Header 3.4

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

warnings.filterwarnings('ignore')

pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table; `release_date` is parsed as dates while reading.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Integer key code -> note name, built from the names in order (0 = "C").
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)

# Integer mode flag -> mode name.
map_mode = {0: "Minor", 1: "Major"}

# Replace the numeric codes with their names; a value missing from the
# mapping becomes NaN.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Row labels of every repeated (artists, name) pair; with the default
# `duplicated()` settings the first occurrence of each pair is not flagged.
check_dups = df.loc[:, ["artists", "name"]]
dups = check_dups.loc[check_dups.duplicated()].index

In [None]:
# Report the frame's shape on either side of removing the duplicate rows.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Convert track length from milliseconds to minutes (60_000 ms per minute),
# then rename the column to match its new unit.
df["duration_ms"] = df["duration_ms"].div(60_000)
df = df.rename({"duration_ms": "duration_min"}, axis="columns")
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Remove the `release_date` and `id` columns.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The histogram (50 bins requested) takes the wider left panel. The box
    plot sits in the right panel together with dotted lines and arrow
    annotations at the column's mean and median. The figure is displayed
    with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    df : DataFrame-like
        Table holding the data; only ``df[col]`` is read.
    col : str
        Column to plot. ``df[col]`` must provide ``.mean()`` and ``.median()``.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()

    # (line colour, level, annotation x, annotation text) for each statistic
    stat_marks = (
        ("#A6E3A1", mean_val, -0.7, f"Mean = {mean_val:.2f}"),
        ("#CBA6F7", median_val, 0.7, f"Median = {median_val:.2f}"),
    )
    # Everything except the histogram is placed in the right-hand panel.
    box_cell = {"row": 1, "col": 2}

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    fig.add_trace(
        go.Histogram(
            x=values,
            nbinsx=50,
            name=f"Histogram {col}",
            marker={"color": "#EBA0AC"},
        ),
        row=1,
        col=1,
    )

    # Dotted horizontal lines at the mean and the median.
    for line_color, level, _, _ in stat_marks:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": line_color, "width": 3, "dash": "dot"},
            **box_cell,
        )

    # Arrow labels for the two lines, placed left and right of the box.
    for _, level, arrow_x, label in stat_marks:
        fig.add_annotation(
            xref="paper",
            x=arrow_x,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=label,
            **box_cell,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        **box_cell,
    )

    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
    )

    # Axis titles: histogram panel first, then the box-plot panel.
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", **box_cell)
    fig.update_yaxes(title_text=f"{col}", **box_cell)
    fig.show()

In [None]:
# Numeric features that each get a histogram + box-plot figure.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in music_vals:
    fun_subplots_plotly(df=df, col=col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Label every track with its calendar decade ("1920s" ... "2020s").
#
# The bins are closed on the LEFT, so a decade runs from its first year up to
# (not including) the next decade's first year: 1930 -> "1930s". The earlier
# right-closed bins filed 1930 under "1920s" and 2020 under "2010s", and left
# 1920 without a label. The last edge is 2030 so that 2020 gets its own label.
# Years outside 1920-2029 stay NaN.
bins = pd.IntervalIndex.from_breaks(list(range(1920, 2031, 10)), closed="left")

# Interval text as produced by astype("str") -> decade label.
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Tracks per decade as a frame with "decade" and "count" columns, ordered by
# decade label.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values(by="decade", ascending=True)
)

# Horizontal colour bar placed above the plot area.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count").update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Keep only tracks that run five minutes or less.
df = df[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade (after the duration filter) as a frame with "decade" and
# "count" columns, ordered by decade label.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values(by="decade", ascending=True)
)

# Horizontal colour bar placed above the plot area.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)
fig.update_layout(
    title={"text": "Count of songs per decade (< 5min)", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)
fig.update_xaxes(title="Count").update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Features whose distribution is shown as a stacked histogram per decade.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# Layout settings shared by every figure; only the title changes per feature.
histogram_layout = {
    "bargap": 0.1,
    "bargroupgap": 0.05,
    "barmode": "stack",
    "legend": {"title": "Decade"},
}

for col in cols:
    fig = px.histogram(
        df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
        **histogram_layout,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols`, one row per decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# Grid size, titles and traces are all derived from the columns, replacing
# eleven hand-numbered add_trace calls: editing `cols` can no longer leave a
# stale index or a missing panel.
grid_cols = 3
grid_rows = max(1, -(-len(decade_mean.columns) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=list(decade_mean.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // grid_cols + 1,
        col=position % grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every feature in `cols`, one row per decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature on a 3-column grid, filled row by row.
# Grid size, titles and traces are all derived from the columns, replacing
# eleven hand-numbered add_trace calls: editing `cols` can no longer leave a
# stale index or a missing panel.
grid_cols = 3
grid_rows = max(1, -(-len(decade_median.columns) // grid_cols))  # ceiling division

fig = make_subplots(
    rows=grid_rows,
    cols=grid_cols,
    subplot_titles=list(decade_median.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // grid_cols + 1,
        col=position % grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# Track counts per decade, split by each categorical attribute. The "year"
# column is only something to count; it is renamed to "count" afterwards.
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# One bar chart per attribute, shown in the order explicit -> mode -> key.
# Only the key chart is stacked; the other two place bars side by side.
for count_frame, hue_col, fig_title, bar_mode in (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
):
    fig = px.histogram(
        data_frame=count_frame,
        x="decade",
        y="count",
        color=hue_col,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": fig_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the temporary frames (and the loop's last reference to one of them).
del (mode_count, key_count, explicit_count, count_frame)

In [None]:
# Numeric features to plot against popularity, one scatter figure each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, translucent points so dense regions stay readable.
point_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    fig = px.scatter(
        df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=point_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed against popularity, one panel each on a 3 x 4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# plt.subplot numbers panels from 1, hence start=1.
for position, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, position)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
        ax=ax,
    )

plt.tight_layout()
plt.show()

In [None]:
# Features (plus popularity) whose pairwise correlations are drawn.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Compute the correlation matrix once; it feeds both the mask and the plot.
corr_matrix = df[cols].corr()

# Hide the upper triangle (diagonal included) so each pair appears once.
mask = np.triu(np.ones_like(corr_matrix))

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, mask=mask, cmap="Blues", annot=True, fmt=".2f")
plt.grid(visible=False)

In [None]:
# Derive `first_artist` from `artists`, which presumably holds a stringified
# list such as "['A', 'B']" (TODO confirm against data.csv): strip the list
# punctuation, then keep everything before the first comma.
#
# The earlier version first cut each value at the letter "x"
# (`str(x).split("x")[0]`), which truncated every name containing an "x";
# that step is removed here.
# NOTE(review): the hard-coded row labels dropped from `popularity_check`
# further down were presumably chosen against the old, truncated names --
# re-check them after this change.
# Known limitation (unchanged): a name that itself contains a comma is still
# cut at that comma, and only single quotes are stripped.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, highest mean first. The aggregations are
# passed as a list (not a set literal) so the resulting column order is the
# same in every kernel session.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these three row labels are hard-coded positions in the sorted
# table above. Which artists they refer to depends on the data and on how
# `first_artist` was derived -- confirm they still point at the intended rows.
# A missing label raises KeyError, as before.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then the 50 highest means among them.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track by the artists listed in `popularity_check`, keeping the
# artists in that table's order. The per-artist slices are concatenated once;
# growing a DataFrame with pd.concat inside the loop re-copies all accumulated
# rows on every pass.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

# With no qualifying artist, fall back to an empty frame that still has df's
# columns so the plotting cells below can look them up.
top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else df.iloc[0:0]
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, sorted ascending.
plt.figure(figsize=(12, 8))
tracks_per_artist = top_50_artists["first_artist"].value_counts()
tracks_per_artist.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of the selected artists' tracks in each decade.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot.barh()

In [None]:
# Features whose per-artist density curves are drawn, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# One curve per artist in every panel; the legend is suppressed.
for position, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, position)
    sns.kdeplot(
        x=col,
        hue="first_artist",
        data=top_50_artists,
        legend=False,
        ax=ax,
    )

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries used below.
warnings.filterwarnings('ignore')

# Plotly defaults: renderer used by fig.show() and the figure template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# Render matplotlib figures inline, in retina format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: all columns, two decimals, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure-title size.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# 'mocha' is presumably registered by the mplcatppuccin import — TODO confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table; release_date is parsed as a date column on read.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Integer key code (0-11) -> note name, in chromatic order starting at C.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)
# Integer mode flag -> scale name.
map_mode = {1: "Major", 0: "Minor"}

# Replace the numeric codes with their labels; values missing from a mapping
# become NaN, so this cell is not safe to run twice on the same df.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Index labels of rows that repeat an earlier (artists, name) pair; the first
# occurrence of each pair is not flagged.
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat]

In [None]:
# Remove the flagged duplicate rows and report the shape on either side.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Track length: milliseconds -> minutes, with the column renamed to match.
df = (
    df.assign(duration_ms=df["duration_ms"] / 60_000)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Drop the two columns the analysis below does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Display a histogram and a box plot of ``df[col]`` side by side.

    The left panel (75% of the width) is a 50-bin histogram. The right panel
    is a box plot with dotted lines and arrow annotations at the column's
    mean and median. The figure is shown with ``fig.show()``; nothing is
    returned.

    Parameters
    ----------
    df : object indexable by ``col`` whose result offers ``mean()`` and
        ``median()`` (a pandas DataFrame at the call sites in this notebook).
    col : column label to plot; also used in the titles and trace names.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()
    box_cell = {"row": 1, "col": 2}

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])
    fig.add_trace(
        go.Histogram(
            x=values,
            name=f"Histogram {col}",
            marker={"color": "#EBA0AC"},
            nbinsx=50,
        ),
        row=1,
        col=1,
    )

    # Dotted reference lines on the box panel: mean first, then median.
    for level, colour in ((mean_val, "#A6E3A1"), (median_val, "#CBA6F7")):
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_cell,
        )

    # Arrow labels for the two lines, at x = -0.7 (mean) and x = 0.7 (median).
    for x_pos, level, label in ((-0.7, mean_val, "Mean"), (0.7, median_val, "Median")):
        fig.add_annotation(
            xref="paper",
            x=x_pos,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {level:.2f}",
            **box_cell,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        **box_cell,
    )

    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
    )
    # Axis titles: count vs. value on the histogram, value only on the box plot.
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric columns to profile; each one gets its own histogram/box figure.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in music_vals:
    fun_subplots_plotly(df, col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Label each track with its decade ("1920s" ... "2020s").
# Bins are closed on the LEFT, [1920, 1930), [1930, 1940), ..., so a year such
# as 1930 lands in the "1930s". The earlier right-closed bins (1920, 1930] put
# 1930 in the "1920s", 2020 in the "2010s", and left 1920 unlabelled.
bins = list(range(1920, 2031, 10))
decade_labels = [f"{start}s" for start in bins[:-1]]

# Years outside [1920, 2030) get NaN. Casting to object keeps "decade" a plain
# string column rather than a categorical one.
df["decade"] = pd.cut(
    df["year"], bins=bins, labels=decade_labels, right=False
).astype(object)

df.head()

In [None]:
# Tracks per decade, sorted by decade label. The columns are named explicitly
# so the frame has "decade"/"count" whatever value_counts() calls its result
# (the default names changed in pandas 2.0).
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

# Horizontal colour bar with a fixed pixel size, anchored at its top-right corner.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}
fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Keep only tracks of at most five minutes.
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade after the duration filter, sorted by decade label. The
# columns are named explicitly so the frame has "decade"/"count" whatever
# value_counts() calls its result (the default names changed in pandas 2.0).
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

# Horizontal colour bar with a fixed pixel size, anchored at its top-right corner.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}
fig.update_layout(
    # The filter applied before this cell is `duration_min <= 5`, so the title
    # says "<=" rather than "<".
    title={"text": "Count of songs per decade (<= 5min)", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Features to compare across decades; the per-decade mean/median cells that
# follow read this same `cols` list.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per feature, coloured by decade.
for col in cols:
    fig = px.histogram(
        data_frame=df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
        legend={"title": "Decade"},
        barmode="stack",
        bargap=0.1,
        bargroupgap=0.05,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols` for each decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature, filled row by row on a 3-wide grid.
# The grid is derived from the number of features instead of eleven
# hand-written add_trace calls.
GRID_WIDTH = 3
n_panels = len(decade_mean.columns)
fig = make_subplots(
    rows=-(-n_panels // GRID_WIDTH),  # ceiling division: 11 features -> 4 rows
    cols=GRID_WIDTH,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

for pos, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, pos],
            name=feature,
            orientation="h",
        ),
        row=pos // GRID_WIDTH + 1,
        col=pos % GRID_WIDTH + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every feature in `cols` for each decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature, filled row by row on a 3-wide grid.
# The grid is derived from the number of features instead of eleven
# hand-written add_trace calls.
GRID_WIDTH = 3
n_panels = len(decade_median.columns)
fig = make_subplots(
    rows=-(-n_panels // GRID_WIDTH),  # ceiling division: 11 features -> 4 rows
    cols=GRID_WIDTH,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

for pos, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, pos],
            name=feature,
            orientation="h",
        ),
        row=pos // GRID_WIDTH + 1,
        col=pos % GRID_WIDTH + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# (column, figure title, bar mode) for each categorical breakdown, in the
# order the figures are shown. One loop replaces three copy-pasted stanzas.
count_specs = [
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
]

for feature, title_text, bar_mode in count_specs:
    # Rows per (decade, category); "year" is only the column being counted.
    counts = (
        df.groupby(["decade", feature], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )

    fig = px.histogram(
        data_frame=counts,
        x="decade",
        y="count",
        color=feature,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": title_text, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# No count table is kept once the figures are shown.
del counts

In [None]:
# Features plotted against popularity, one scatter figure each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, mostly transparent markers.
marker_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    fig = px.scatter(
        data_frame=df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=marker_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed on popularity, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# 3 x 4 grid: popularity on x, the feature on y, with a fitted line.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.regplot(
        data=df,
        x="popularity",
        y=col,
        ax=ax,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Compute the matrix once and reuse it for both the mask and the plot.
corr = df[cols].corr()
# Hide the upper triangle, diagonal included, so each pair appears once.
mask = np.triu(np.ones_like(corr))

plt.figure(figsize=(10, 7))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="Blues", mask=mask)
plt.grid(visible=False)

In [None]:
# Reduce the stringified artist list to its first entry: strip brackets and
# quotes, then keep the text before the first comma.
# The earlier split on the letter "x" cut every value at its first "x"
# (e.g. "Jimi Hendrix" -> "Jimi Hendri"), so it has been removed.
# NOTE(review): a name that itself contains a comma is still cut at that comma.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",", n=1)
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, ranked by mean popularity (highest first).
# A list (not a set) keeps the aggregate columns in a deterministic order.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these labels are positions in the ranking above, so which
# artists they remove depends on the data and on earlier cleaning — confirm.
# errors="ignore" avoids a KeyError when a label is absent.
MANUAL_DROP_LABELS = [16091, 16086, 16429]
popularity_check = popularity_check.drop(index=MANUAL_DROP_LABELS, errors="ignore")

# Keep artists with more than MIN_TRACKS tracks, then the 50 best by mean.
MIN_TRACKS = 45
popularity_check = (
    popularity_check[popularity_check["count"] > MIN_TRACKS]
    .sort_values(by="mean", ascending=False)
    .head(50)
)

# Header 1

## Header 2

### Header 3.1

In [None]:
# All tracks by the selected artists, grouped artist by artist in the order of
# popularity_check; rows of one artist keep their order from df.
# A single filter plus a stable sort replaces repeated pd.concat in a loop.
top_names = popularity_check["first_artist"].unique()
artist_rank = {name: pos for pos, name in enumerate(top_names)}

top_50_artists = (
    df[df["first_artist"].isin(top_names)]
    .sort_values(
        by="first_artist",
        key=lambda names: names.map(artist_rank),
        kind="stable",
    )
    .reset_index(drop=True)
)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, smallest first.
plt.figure(figsize=(12, 8))
tracks_per_artist = top_50_artists["first_artist"].value_counts()
tracks_per_artist.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of selected-artist tracks per decade, smallest first.
(
    top_50_artists["decade"]
    .value_counts()
    .sort_values(ascending=True)
    .plot.barh()
)

In [None]:
# Features whose per-artist densities are drawn, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# 3 x 4 grid; each KDE is drawn on the panel that plt.subplot just made current.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.kdeplot(data=top_50_artists, x=col, hue="first_artist", legend=False)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries used below.
warnings.filterwarnings('ignore')

# Plotly defaults: renderer used by fig.show() and the figure template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# Render matplotlib figures inline, in retina format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: all columns, two decimals, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure-title size.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# 'mocha' is presumably registered by the mplcatppuccin import — TODO confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table; release_date is parsed as a date column on read.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Integer key code (0-11) -> note name, in chromatic order starting at C.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)
# Integer mode flag -> scale name.
map_mode = {1: "Major", 0: "Minor"}

# Replace the numeric codes with their labels; values missing from a mapping
# become NaN, so this cell is not safe to run twice on the same df.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Index labels of rows that repeat an earlier (artists, name) pair; the first
# occurrence of each pair is not flagged.
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat]

In [None]:
# Remove the flagged duplicate rows and report the shape on either side.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Track length: milliseconds -> minutes, with the column renamed to match.
df = (
    df.assign(duration_ms=df["duration_ms"] / 60_000)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Drop the two columns the analysis below does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Display a histogram and a box plot of ``df[col]`` side by side.

    The left panel (75% of the width) is a 50-bin histogram. The right panel
    is a box plot with dotted lines and arrow annotations at the column's
    mean and median. The figure is shown with ``fig.show()``; nothing is
    returned.

    Parameters
    ----------
    df : object indexable by ``col`` whose result offers ``mean()`` and
        ``median()`` (a pandas DataFrame at the call sites in this notebook).
    col : column label to plot; also used in the titles and trace names.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()
    box_cell = {"row": 1, "col": 2}

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])
    fig.add_trace(
        go.Histogram(
            x=values,
            name=f"Histogram {col}",
            marker={"color": "#EBA0AC"},
            nbinsx=50,
        ),
        row=1,
        col=1,
    )

    # Dotted reference lines on the box panel: mean first, then median.
    for level, colour in ((mean_val, "#A6E3A1"), (median_val, "#CBA6F7")):
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_cell,
        )

    # Arrow labels for the two lines, at x = -0.7 (mean) and x = 0.7 (median).
    for x_pos, level, label in ((-0.7, mean_val, "Mean"), (0.7, median_val, "Median")):
        fig.add_annotation(
            xref="paper",
            x=x_pos,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=f"{label} = {level:.2f}",
            **box_cell,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        **box_cell,
    )

    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
    )
    # Axis titles: count vs. value on the histogram, value only on the box plot.
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric columns to profile; each one gets its own histogram/box figure.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in music_vals:
    fun_subplots_plotly(df, col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Label each track with its decade ("1920s" ... "2020s").
# Bins are closed on the LEFT, [1920, 1930), [1930, 1940), ..., so a year such
# as 1930 lands in the "1930s". The earlier right-closed bins (1920, 1930] put
# 1930 in the "1920s", 2020 in the "2010s", and left 1920 unlabelled.
bins = list(range(1920, 2031, 10))
decade_labels = [f"{start}s" for start in bins[:-1]]

# Years outside [1920, 2030) get NaN. Casting to object keeps "decade" a plain
# string column rather than a categorical one.
df["decade"] = pd.cut(
    df["year"], bins=bins, labels=decade_labels, right=False
).astype(object)

df.head()

In [None]:
# Tracks per decade, sorted by decade label. The columns are named explicitly
# so the frame has "decade"/"count" whatever value_counts() calls its result
# (the default names changed in pandas 2.0).
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

# Horizontal colour bar with a fixed pixel size, anchored at its top-right corner.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}
fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Keep only tracks of at most five minutes.
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade after the duration filter, sorted by decade label. The
# columns are named explicitly so the frame has "decade"/"count" whatever
# value_counts() calls its result (the default names changed in pandas 2.0).
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

# Horizontal colour bar with a fixed pixel size, anchored at its top-right corner.
colorbar_style = {
    "title": "Count",
    "thicknessmode": "pixels",
    "thickness": 20,
    "lenmode": "pixels",
    "len": 500,
    "yanchor": "top",
    "xanchor": "right",
    "y": 1.15,
    "x": 0.95,
    "orientation": "h",
}
fig.update_layout(
    # The filter applied before this cell is `duration_min <= 5`, so the title
    # says "<=" rather than "<".
    title={"text": "Count of songs per decade (<= 5min)", "font": {"size": 24}},
    coloraxis_colorbar=colorbar_style,
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

del decade_counts

In [None]:
# Features to compare across decades; the per-decade mean/median cells that
# follow read this same `cols` list.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per feature, coloured by decade.
for col in cols:
    fig = px.histogram(
        data_frame=df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
        legend={"title": "Decade"},
        barmode="stack",
        bargap=0.1,
        bargroupgap=0.05,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every feature in `cols` for each decade.
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature, filled row by row on a 3-wide grid.
# The grid is derived from the number of features instead of eleven
# hand-written add_trace calls.
GRID_WIDTH = 3
n_panels = len(decade_mean.columns)
fig = make_subplots(
    rows=-(-n_panels // GRID_WIDTH),  # ceiling division: 11 features -> 4 rows
    cols=GRID_WIDTH,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

for pos, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, pos],
            name=feature,
            orientation="h",
        ),
        row=pos // GRID_WIDTH + 1,
        col=pos % GRID_WIDTH + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every feature in `cols` for each decade.
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature, filled row by row on a 3-wide grid.
# The grid is derived from the number of features instead of eleven
# hand-written add_trace calls.
GRID_WIDTH = 3
n_panels = len(decade_median.columns)
fig = make_subplots(
    rows=-(-n_panels // GRID_WIDTH),  # ceiling division: 11 features -> 4 rows
    cols=GRID_WIDTH,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

for pos, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, pos],
            name=feature,
            orientation="h",
        ),
        row=pos // GRID_WIDTH + 1,
        col=pos % GRID_WIDTH + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# (column, figure title, bar mode) for each categorical breakdown, in the
# order the figures are shown. One loop replaces three copy-pasted stanzas.
count_specs = [
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
]

for feature, title_text, bar_mode in count_specs:
    # Rows per (decade, category); "year" is only the column being counted.
    counts = (
        df.groupby(["decade", feature], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )

    fig = px.histogram(
        data_frame=counts,
        x="decade",
        y="count",
        color=feature,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": title_text, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# No count table is kept once the figures are shown.
del counts

In [None]:
# Features plotted against popularity, one scatter figure each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, mostly transparent markers.
marker_style = {"opacity": 0.3, "size": 3, "color": "#B4BEFE"}

for col in cols:
    fig = px.scatter(
        data_frame=df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=marker_style)
    fig.update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed on popularity, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# 3 x 4 grid: popularity on x, the feature on y, with a fitted line.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.regplot(
        data=df,
        x="popularity",
        y=col,
        ax=ax,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Compute the matrix once and reuse it for both the mask and the plot.
corr = df[cols].corr()
# Hide the upper triangle, diagonal included, so each pair appears once.
mask = np.triu(np.ones_like(corr))

plt.figure(figsize=(10, 7))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="Blues", mask=mask)
plt.grid(visible=False)

In [None]:
# Reduce the stringified artist list to its first entry: strip brackets and
# quotes, then keep the text before the first comma.
# The earlier split on the letter "x" cut every value at its first "x"
# (e.g. "Jimi Hendrix" -> "Jimi Hendri"), so it has been removed.
# NOTE(review): a name that itself contains a comma is still cut at that comma.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",", n=1)
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, ranked by mean popularity (highest first).
# A list (not a set) keeps the aggregate columns in a deterministic order.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these labels are positions in the ranking above, so which
# artists they remove depends on the data and on earlier cleaning — confirm.
# errors="ignore" avoids a KeyError when a label is absent.
MANUAL_DROP_LABELS = [16091, 16086, 16429]
popularity_check = popularity_check.drop(index=MANUAL_DROP_LABELS, errors="ignore")

# Keep artists with more than MIN_TRACKS tracks, then the 50 best by mean.
MIN_TRACKS = 45
popularity_check = (
    popularity_check[popularity_check["count"] > MIN_TRACKS]
    .sort_values(by="mean", ascending=False)
    .head(50)
)

# Header 1

## Header 2

### Header 3.1

In [None]:
# All tracks by the selected artists, grouped artist by artist in the order of
# popularity_check; rows of one artist keep their order from df.
# A single filter plus a stable sort replaces repeated pd.concat in a loop.
top_names = popularity_check["first_artist"].unique()
artist_rank = {name: pos for pos, name in enumerate(top_names)}

top_50_artists = (
    df[df["first_artist"].isin(top_names)]
    .sort_values(
        by="first_artist",
        key=lambda names: names.map(artist_rank),
        kind="stable",
    )
    .reset_index(drop=True)
)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, smallest first.
plt.figure(figsize=(12, 8))
tracks_per_artist = top_50_artists["first_artist"].value_counts()
tracks_per_artist.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of selected-artist tracks per decade, smallest first.
(
    top_50_artists["decade"]
    .value_counts()
    .sort_values(ascending=True)
    .plot.barh()
)

In [None]:
# Features whose per-artist densities are drawn, one panel each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# 3 x 4 grid; each KDE is drawn on the panel that plt.subplot just made current.
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.kdeplot(data=top_50_artists, x=col, hue="first_artist", legend=False)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries used below.
warnings.filterwarnings('ignore')

# Plotly defaults: renderer used by fig.show() and the figure template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# Render matplotlib figures inline, in retina format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: all columns, two decimals, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Matplotlib figure defaults: automatic layout and figure-title size.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# 'mocha' is presumably registered by the mplcatppuccin import — TODO confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table; release_date is parsed as a date column on read.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Integer key code (0-11) -> note name, in chromatic order starting at C.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)
# Integer mode flag -> scale name.
map_mode = {1: "Major", 0: "Minor"}

# Replace the numeric codes with their labels; values missing from a mapping
# become NaN, so this cell is not safe to run twice on the same df.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Index labels of rows that repeat an earlier (artists, name) pair; the first
# occurrence of each pair is not flagged.
check_dups = df.loc[:, ["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat]

In [None]:
# Remove the flagged duplicate rows and report the shape on either side.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Track length: milliseconds -> minutes, with the column renamed to match.
df = (
    df.assign(duration_ms=df["duration_ms"] / 60_000)
    .rename(columns={"duration_ms": "duration_min"})
)
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Drop the two columns the analysis below does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Display a histogram and a box plot of ``df[col]`` side by side.

    The figure has one row and two panels: a histogram (``nbinsx=50``) taking
    75% of the width and a box plot taking the remaining 25%. Dotted
    horizontal lines and arrow annotations mark the mean and the median of the
    column on the box-plot panel.

    Parameters
    ----------
    df : object indexable as ``df[col]``
        The selected column must provide ``.mean()`` and ``.median()``.
    col : hashable
        Column label; also used in trace names, axis titles and the title.

    Returns
    -------
    None
        The figure is shown with ``fig.show()`` and not returned.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    fig.add_trace(
        go.Histogram(
            x=values,
            name=f"Histogram {col}",
            marker={"color": "#EBA0AC"},
            nbinsx=50,
        ),
        row=1,
        col=1,
    )

    # (level, line colour, annotation x position, annotation text)
    # NOTE(review): `xref="paper"` is passed together with row/col; plotly may
    # re-target it to the subplot's own x axis -- confirm the intended span.
    markers = (
        (mean_val, "#A6E3A1", -0.7, f"Mean = {mean_val:.2f}"),
        (median_val, "#CBA6F7", 0.7, f"Median = {median_val:.2f}"),
    )

    for level, colour, _x_pos, _label in markers:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": colour, "width": 3, "dash": "dot"},
            row=1,
            col=2,
        )

    for level, _colour, x_pos, label in markers:
        fig.add_annotation(
            xref="paper",
            x=x_pos,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=label,
            row=1,
            col=2,
        )

    fig.add_trace(
        go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"}),
        row=1,
        col=2,
    )

    # Horizontal legend placed just above the plotting area, right-aligned.
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend={
            "orientation": "h",
            "yanchor": "bottom",
            "y": 1.02,
            "xanchor": "right",
            "x": 1,
        },
    )

    # Left panel: value on x, count on y. Right panel: value on y, no x title.
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Columns whose distribution is plotted (histogram + box plot each).
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for feature in music_vals:
    fun_subplots_plotly(df, feature)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Left-closed decade bins: [1920, 1930), [1930, 1940), ..., [2020, 2030).
# The former right-closed bins "(1920, 1930]" assigned every year ending in 0
# to the previous decade (1930 -> "1920s", 2020 -> "2010s") and left 1920
# without a label.
bins = pd.IntervalIndex.from_breaks(list(range(1920, 2031, 10)), closed="left")

# Interval text as produced by `astype("str")` -> decade label,
# e.g. "[1920, 1930)" -> "1920s".
map_decade = {str(interval): f"{int(interval.left)}s" for interval in bins}

# Years outside every bin are NaN after `pd.cut`, "nan" after the string
# cast, and NaN again after the map because no key matches.
df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Tracks per decade, ordered by decade label; the bar chart below reads the
# "decade" and "count" columns of this frame.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values("decade")
)

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Sunset,
)

# Horizontal colour bar, sized in pixels (500 long, 20 thick).
fig.update_layout(
    title=dict(text="Count of songs per decade", font=dict(size=24)),
    margin=dict(r=10),
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The helper frame is only needed for this chart.
del decade_counts

In [None]:
# Keep only tracks lasting at most five minutes (boundary included).
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade after the duration filter, ordered by decade label; the
# bar chart below reads the "decade" and "count" columns of this frame.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values("decade")
)

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Sunset,
)

# Horizontal colour bar, sized in pixels (500 long, 20 thick).
fig.update_layout(
    title=dict(text="Count of songs per decade (< 5min)", font=dict(size=24)),
    margin=dict(r=10),
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The helper frame is only needed for this chart.
del decade_counts

In [None]:
# Features plotted as stacked histograms, coloured by decade. `cols` is also
# read by the per-decade aggregation cells that follow.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in cols:
    fig = (
        px.histogram(
            df,
            x=col,
            color="decade",
            color_discrete_sequence=px.colors.qualitative.Light24,
        )
        .update_traces(opacity=0.7)
        .update_layout(
            title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
            legend={"title": "Decade"},
            barmode="stack",
            bargap=0.1,
            bargroupgap=0.05,
        )
        .update_yaxes(title="Count")
    )
    fig.show()

In [None]:
# Mean of every feature in `cols` per decade (rows: decades, columns: features).
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a three-column grid. The number of
# rows follows the number of features instead of being hard-coded for exactly
# eleven columns; with eleven features the placement is unchanged (4 x 3 grid,
# filled row by row, last cell empty).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_mean.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

# Vertical legend placed in the lower-right region of the figure.
fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The aggregate is only needed for this figure.
del decade_mean

In [None]:
# Median of every feature in `cols` per decade (rows: decades, columns: features).
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature on a three-column grid. The number of
# rows follows the number of features instead of being hard-coded for exactly
# eleven columns; with eleven features the placement is unchanged (4 x 3 grid,
# filled row by row, last cell empty).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_median.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

# Vertical legend placed in the lower-right region of the figure.
fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The aggregate is only needed for this figure.
del decade_median

In [None]:
# Row counts per (decade, category) for the three categorical columns; the
# non-null `year` values are counted and the result column is named "count".
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# (counts frame, colour column, chart title, bar mode), in display order.
chart_specs = (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
)

for counts, category, chart_title, bar_mode in chart_specs:
    fig = px.histogram(
        counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        text_auto=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        barmode=bar_mode,
        bargap=0.2,
        bargroupgap=0.1,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the helper frames, including the references held by the loop.
del (mode_count, key_count, explicit_count, chart_specs, counts)

In [None]:
# Features plotted against popularity, each with an OLS trend line.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

for col in cols:
    fig = (
        px.scatter(
            df,
            x=col,
            y="popularity",
            trendline="ols",
            trendline_color_override="#A6E3A1",
        )
        # Small, translucent markers to cope with heavy overplotting.
        .update_traces(marker={"opacity": 0.3, "size": 3, "color": "#B4BEFE"})
        .update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    )
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed on popularity (popularity on x, feature on y).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One regression panel per feature on a 3x4 grid.
plt.figure(figsize=(15, 10))
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
        ax=ax,
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix (features plus popularity).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# The correlation matrix feeds both the mask and the heat map.
corr = df[cols].corr()

# `np.triu` keeps the upper triangle (diagonal included) non-zero; those cells
# are passed as the heat map's mask.
mask = np.triu(np.ones_like(corr))

plt.figure(figsize=(10, 7))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Blues")
plt.grid(visible=False)

In [None]:
# Derive `first_artist` from the `artists` text: strip square brackets and
# single quotes, then keep everything before the first comma.
#
# The previous version first cut the raw string at the first letter "x"
# (`split("x")[0]`), which truncated every artist name containing an "x";
# that step is removed.
# NOTE(review): a name that itself contains a comma is still cut at that
# comma, and apostrophes inside names are removed -- confirm that is acceptable.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity statistics, ranked by mean popularity (descending).
# The aggregations are given as a list: a set has no defined iteration order,
# so the resulting column order could change between runs.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these are hard-coded row labels of the ranking above. They
# depend on the exact data and sort order, and raise KeyError if a label is
# absent -- confirm which artists they are meant to exclude.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then take the 50 with the highest mean.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = (
    popularity_check.drop(to_drop)
    .sort_values(by="mean", ascending=False)
    .head(50)
)

# Header 1

## Header 2

### Header 3.1

In [None]:
# All tracks of the selected artists, grouped artist by artist in the order of
# `popularity_check`. The per-artist slices are collected in a list and
# concatenated once; growing a frame with `pd.concat` inside the loop copies
# the accumulated rows on every iteration.
top_artist_names = popularity_check["first_artist"].unique()
artist_frames = [df[df["first_artist"] == name] for name in top_artist_names]

# `pd.concat` rejects an empty list, so fall back to an empty frame.
top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, in ascending order
# of count.
plt.figure(figsize=(12, 8))
top_50_artists["first_artist"].value_counts().sort_values().plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of the selected artists' tracks per decade, in
# ascending order of count.
(
    top_50_artists["decade"]
    .value_counts()
    .sort_values()
    .plot.barh()
)

In [None]:
# Features whose density is compared across the selected artists.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One KDE panel per feature on a 3x4 grid; `hue` draws one curve per artist,
# so the legend is switched off.
plt.figure(figsize=(15, 10))
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.kdeplot(data=top_50_artists, x=col, hue="first_artist", legend=False, ax=ax)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Left-closed decade bins: [1920, 1930), [1930, 1940), ..., [2020, 2030).
# The former right-closed bins "(1920, 1930]" assigned every year ending in 0
# to the previous decade (1930 -> "1920s", 2020 -> "2010s") and left 1920
# without a label.
bins = pd.IntervalIndex.from_breaks(list(range(1920, 2031, 10)), closed="left")

# Interval text as produced by `astype("str")` -> decade label,
# e.g. "[1920, 1930)" -> "1920s".
map_decade = {str(interval): f"{int(interval.left)}s" for interval in bins}

# Years outside every bin are NaN after `pd.cut`, "nan" after the string
# cast, and NaN again after the map because no key matches.
df["decade"] = pd.cut(df["year"], bins).astype("str").map(map_decade)

df.head()

In [None]:
# Tracks per decade, ordered by decade label; the bar chart below reads the
# "decade" and "count" columns of this frame.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values("decade")
)

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Sunset,
)

# Horizontal colour bar, sized in pixels (500 long, 20 thick).
fig.update_layout(
    title=dict(text="Count of songs per decade", font=dict(size=24)),
    margin=dict(r=10),
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The helper frame is only needed for this chart.
del decade_counts

In [None]:
# Keep only tracks lasting at most five minutes (boundary included).
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Tracks per decade after the duration filter, ordered by decade label; the
# bar chart below reads the "decade" and "count" columns of this frame.
decade_counts = (
    df["decade"]
    .value_counts()
    .reset_index()
    .sort_values("decade")
)

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    text_auto=True,
    color_continuous_scale=px.colors.sequential.Sunset,
)

# Horizontal colour bar, sized in pixels (500 long, 20 thick).
fig.update_layout(
    title=dict(text="Count of songs per decade (< 5min)", font=dict(size=24)),
    margin=dict(r=10),
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The helper frame is only needed for this chart.
del decade_counts

In [None]:
# Features plotted as stacked histograms, coloured by decade. `cols` is also
# read by the per-decade aggregation cells that follow.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in cols:
    fig = (
        px.histogram(
            df,
            x=col,
            color="decade",
            color_discrete_sequence=px.colors.qualitative.Light24,
        )
        .update_traces(opacity=0.7)
        .update_layout(
            title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
            legend={"title": "Decade"},
            barmode="stack",
            bargap=0.1,
            bargroupgap=0.05,
        )
        .update_yaxes(title="Count")
    )
    fig.show()

In [None]:
# Mean of every feature in `cols` per decade (rows: decades, columns: features).
decade_mean = df.groupby("decade")[cols].mean()

# One horizontal-bar panel per feature on a three-column grid. The number of
# rows follows the number of features instead of being hard-coded for exactly
# eleven columns; with eleven features the placement is unchanged (4 x 3 grid,
# filled row by row, last cell empty).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_mean.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

# Vertical legend placed in the lower-right region of the figure.
fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The aggregate is only needed for this figure.
del decade_mean

In [None]:
# Median of every feature in `cols` per decade (rows: decades, columns: features).
decade_median = df.groupby("decade")[cols].median()

# One horizontal-bar panel per feature on a three-column grid. The number of
# rows follows the number of features instead of being hard-coded for exactly
# eleven columns; with eleven features the placement is unchanged (4 x 3 grid,
# filled row by row, last cell empty).
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_median.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

for position, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

# Vertical legend placed in the lower-right region of the figure.
fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The aggregate is only needed for this figure.
del decade_median

In [None]:
# Row counts per (decade, category) for the three categorical columns; the
# non-null `year` values are counted and the result column is named "count".
explicit_count, key_count, mode_count = (
    df.groupby(["decade", category], as_index=False)["year"]
    .count()
    .rename(columns={"year": "count"})
    for category in ("explicit", "key", "mode")
)

# (counts frame, colour column, chart title, bar mode), in display order.
chart_specs = (
    (explicit_count, "explicit", "Explicit Count per Decade", "group"),
    (mode_count, "mode", "Mode Count per Decade", "group"),
    (key_count, "key", "Key Count per Decade", "stack"),
)

for counts, category, chart_title, bar_mode in chart_specs:
    fig = px.histogram(
        counts,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        text_auto=True,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        barmode=bar_mode,
        bargap=0.2,
        bargroupgap=0.1,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Release the helper frames, including the references held by the loop.
del (mode_count, key_count, explicit_count, chart_specs, counts)

In [None]:
# Features plotted against popularity, each with an OLS trend line.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

for col in cols:
    fig = (
        px.scatter(
            df,
            x=col,
            y="popularity",
            trendline="ols",
            trendline_color_override="#A6E3A1",
        )
        # Small, translucent markers to cope with heavy overplotting.
        .update_traces(marker={"opacity": 0.3, "size": 3, "color": "#B4BEFE"})
        .update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    )
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed on popularity (popularity on x, feature on y).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One regression panel per feature on a 3x4 grid.
plt.figure(figsize=(15, 10))
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
        ax=ax,
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix (features plus popularity).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# The correlation matrix feeds both the mask and the heat map.
corr = df[cols].corr()

# `np.triu` keeps the upper triangle (diagonal included) non-zero; those cells
# are passed as the heat map's mask.
mask = np.triu(np.ones_like(corr))

plt.figure(figsize=(10, 7))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="Blues")
plt.grid(visible=False)

In [None]:
# Derive `first_artist` from the `artists` text: strip square brackets and
# single quotes, then keep everything before the first comma.
#
# The previous version first cut the raw string at the first letter "x"
# (`split("x")[0]`), which truncated every artist name containing an "x";
# that step is removed.
# NOTE(review): a name that itself contains a comma is still cut at that
# comma, and apostrophes inside names are removed -- confirm that is acceptable.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity statistics, ranked by mean popularity (descending).
# The aggregations are given as a list: a set has no defined iteration order,
# so the resulting column order could change between runs.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these are hard-coded row labels of the ranking above. They
# depend on the exact data and sort order, and raise KeyError if a label is
# absent -- confirm which artists they are meant to exclude.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then take the 50 with the highest mean.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = (
    popularity_check.drop(to_drop)
    .sort_values(by="mean", ascending=False)
    .head(50)
)

# Header 1

## Header 2

### Header 3.1

In [None]:
# All tracks of the selected artists, grouped artist by artist in the order of
# `popularity_check`. The per-artist slices are collected in a list and
# concatenated once; growing a frame with `pd.concat` inside the loop copies
# the accumulated rows on every iteration.
top_artist_names = popularity_check["first_artist"].unique()
artist_frames = [df[df["first_artist"] == name] for name in top_artist_names]

# `pd.concat` rejects an empty list, so fall back to an empty frame.
top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars: number of tracks per selected artist, in ascending order
# of count.
plt.figure(figsize=(12, 8))
top_50_artists["first_artist"].value_counts().sort_values().plot.barh()
plt.show()

In [None]:
# Horizontal bars: number of the selected artists' tracks per decade, in
# ascending order of count.
(
    top_50_artists["decade"]
    .value_counts()
    .sort_values()
    .plot.barh()
)

In [None]:
# Features whose density is compared across the selected artists.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One KDE panel per feature on a 3x4 grid; `hue` draws one curve per artist,
# so the legend is switched off.
plt.figure(figsize=(15, 10))
for panel, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.kdeplot(data=top_50_artists, x=col, hue="first_artist", legend=False, ax=ax)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Silence every warning for the rest of the session. Note this also hides
# deprecation warnings raised by the plotting/data libraries.
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# Matplotlib figures are rendered inline, in 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# pandas display: no column limit, two decimals, left-justified headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Figure-level rc defaults: automatic layout and an 18pt figure title.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# Stack 'ggplot' with 'mocha'. NOTE(review): 'mocha' is presumably registered
# by the `mplcatppuccin` import -- confirm before removing that import.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Read the track table; `release_date` is parsed as a date column on load.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Integer code -> pitch-class name for the `key` column (0 = C ... 11 = B).
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)

# Integer code -> label for the `mode` column.
map_mode = {1: "Major", 0: "Minor"}

# Replace the numeric codes by their labels; codes missing from a mapping
# become NaN, so this cell must run exactly once per load of `df`.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# A row is a duplicate when an earlier row has the same (artists, name) pair;
# `dups` collects the index labels of those repeated rows.
check_dups = df.loc[:, ["artists", "name"]]
dups = check_dups.index[check_dups.duplicated().to_numpy()]

In [None]:
# Report the frame shape on both sides of the duplicate removal.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Rename first, then rescale in place, so the column keeps its position:
# dividing by 60_000 turns the former `duration_ms` values into minutes.
df = df.rename(columns={"duration_ms": "duration_min"})
df["duration_min"] = df["duration_min"] / 60_000
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Remove the `release_date` and `id` columns from the working frame.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a histogram of ``df[col]`` built
    with ``nbinsx=50``. The right panel is a box plot of the same values,
    with dotted horizontal lines and arrow annotations at the mean and the
    median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    col : str
        Name of the column; it must support ``.mean()`` and ``.median()``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    series = df[col]
    mean_val, median_val = series.mean(), series.median()
    # Grid position of the box-plot panel, reused by every call that targets it.
    box_cell = {"row": 1, "col": 2}

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=series, name=f"Histogram {col}", marker={"color": "#EBA0AC"}, nbinsx=50
    )
    fig.add_trace(histogram, row=1, col=1)

    # Dotted horizontal reference lines: mean first, then median.
    for level, colour in ((mean_val, "#A6E3A1"), (median_val, "#CBA6F7")):
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_cell,
        )

    # Arrow labels: mean at x=-0.7, median at x=0.7.
    for x_pos, level, caption in (
        (-0.7, mean_val, f"Mean = {mean_val:.2f}"),
        (0.7, median_val, f"Median = {median_val:.2f}"),
    ):
        fig.add_annotation(
            xref="paper",
            x=x_pos,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=caption,
            **box_cell,
        )

    box = go.Box(y=series, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(box, **box_cell)

    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        # Horizontal legend sitting just above the plot area, right-aligned.
        legend={
            "orientation": "h",
            "xanchor": "right",
            "x": 1,
            "yanchor": "bottom",
            "y": 1.02,
        },
    )
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_xaxes(title_text="", **box_cell)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text=f"{col}", **box_cell)
    fig.show()

In [None]:
# Columns to profile; one histogram + box-plot figure is drawn per column.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in music_vals:
    fun_subplots_plotly(df=df, col=col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Left-closed decade buckets, so a decade's first year (e.g. 1930) lands in that
# decade rather than the one before it. 2030 is the final break so that 2020
# gets a bucket of its own.
bins = pd.IntervalIndex.from_breaks(
    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],
    closed="left",
)

df["decade"] = pd.cut(df["year"], bins).astype("str")

# Interval text -> decade label, e.g. "[1920, 1930)" -> "1920s".
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

# Years outside every bucket have no entry in the mapping and become NaN.
df["decade"] = df["decade"].map(map_decade)

df.head()

In [None]:
# Number of tracks per decade, sorted by decade label for the bar chart.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade")

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title=dict(text="Count of songs per decade", font=dict(size=24)),
    # Horizontal colour bar with a fixed pixel size, anchored by its top-right corner.
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
    margin=dict(r=10),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The counts table is only needed for this figure.
del decade_counts

In [None]:
# Keep only tracks that last at most five minutes.
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade after the duration filter, sorted by decade label.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade")

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title=dict(text="Count of songs per decade (< 5min)", font=dict(size=24)),
    # Horizontal colour bar with a fixed pixel size, anchored by its top-right corner.
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
    margin=dict(r=10),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The counts table is only needed for this figure.
del decade_counts

In [None]:
# Columns to plot; the per-decade aggregate cells below reuse this list.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per column, coloured by decade.
for col in cols:
    fig = px.histogram(
        df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title=dict(text=f"Histogram of {col} per decade", font=dict(size=24)),
        legend=dict(title="Decade"),
        barmode="stack",
        bargap=0.1,
        bargroupgap=0.05,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every column in `cols` per decade (rows: decades, columns: features).
decade_mean = df.groupby("decade")[cols].mean()

# Three panels per row; the row count is derived from the number of features
# so the grid keeps working when `cols` grows or shrinks.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_mean.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

# One horizontal bar chart per feature, filling the grid row by row.
for position, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend placed near the bottom right of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every column in `cols` per decade (rows: decades, columns: features).
decade_median = df.groupby("decade")[cols].median()

# Three panels per row; the row count is derived from the number of features
# so the grid keeps working when `cols` grows or shrinks.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_median.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

# One horizontal bar chart per feature, filling the grid row by row.
for position, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend placed near the bottom right of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# (column to break down by, bar mode, figure title) for each count chart,
# drawn in this order: explicit, mode, key.
for category, bar_mode, chart_title in (
    ("explicit", "group", "Explicit Count per Decade"),
    ("mode", "group", "Mode Count per Decade"),
    ("key", "stack", "Key Count per Decade"),
):
    # Rows per (decade, category) pair; "year" only serves as a column to count.
    category_count = (
        df.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )

    fig = px.histogram(
        data_frame=category_count,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Remove the loop temporaries so this cell leaves no helper names behind.
del (category, bar_mode, chart_title, category_count)

In [None]:
# Columns plotted against popularity ("year" takes the place of "popularity" here).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One scatter plot per column with an OLS trend line drawn in green.
for col in cols:
    fig = px.scatter(
        df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=dict(opacity=0.3, size=3, color="#B4BEFE"))
    fig.update_layout(title=dict(text=f"{col} vs popularity", font=dict(size=24)))
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Columns regressed against popularity, one panel each on a 3x4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

for i, col in enumerate(cols):
    # Subplot numbers are 1-based, hence the +1.
    ax = plt.subplot(3, 4, i + 1)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        ax=ax,
        scatter_kws=dict(alpha=0.5),
        line_kws=dict(color="#F38BA8"),
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Pairwise correlations, computed once and reused for the mask and the plot.
corr_matrix = df[cols].corr()

plt.figure(figsize=(10, 7))
# Hide the upper triangle (diagonal included) so each pair appears only once.
mask = np.triu(np.ones_like(corr_matrix))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    mask=mask,
)
plt.grid(visible=False)

In [None]:
# Reduce `artists` to its first entry: strip the list punctuation, then keep
# the text before the first comma.
# NOTE(review): assumes `artists` holds the text of a Python list such as
# "['A', 'B']" - confirm against data.csv. Names are not split on the letter
# "x", so names such as "Jimi Hendrix" are kept whole. A name that itself
# contains a comma is still cut at that comma.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, ranked by mean popularity (highest first).
# The aggregations are passed as a list so the column order is the same on every run.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these are hard-coded row labels in the ranked table. They depend
# on the exact data and on how `first_artist` was derived - confirm which
# artists they are meant to remove. A missing label raises KeyError.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then take the 50 with the highest mean.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track of each selected artist, artist by artist in the order of
# `popularity_check`, and concatenate once instead of growing a frame in a loop.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
top_50_artists = pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()

top_50_artists = top_50_artists.reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars of track counts per selected artist, smallest count first.
plt.figure(figsize=(12, 8))
top_50_artists["first_artist"].value_counts().sort_values().plot.barh()
plt.show()

In [None]:
# Horizontal bars of track counts per decade for the selected artists, smallest first.
top_50_artists["decade"].value_counts().sort_values().plot.barh()

In [None]:
# Columns whose per-artist density curves are drawn, one panel each on a 3x4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

for i, col in enumerate(cols):
    # Subplot numbers are 1-based, hence the +1.
    ax = plt.subplot(3, 4, i + 1)
    # One curve per artist; the legend is turned off.
    sns.kdeplot(x=col, hue="first_artist", data=top_50_artists, legend=False, ax=ax)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

warnings.filterwarnings('ignore')

pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Read the track table; `release_date` is parsed into datetimes while loading.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Numeric key code -> note name; a note's position in the list is its code.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)
# Numeric mode flag -> label.
map_mode = {0: "Minor", 1: "Major"}

# Swap the numeric codes for labels; values missing from a mapping become NaN.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Index labels of rows that repeat an earlier (artists, name) pair;
# the first occurrence of each pair is not flagged.
check_dups = df[["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat.to_numpy()]

In [None]:
# Print the frame shape on either side of removing the duplicate rows.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Milliseconds -> minutes (60_000 ms per minute), then rename the column to match.
df["duration_ms"] = df["duration_ms"].div(60_000)
df = df.rename(columns={"duration_ms": "duration_min"})
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Remove the `release_date` and `id` columns from the working frame.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of one column side by side.

    The left panel (75% of the width) is a histogram of ``df[col]`` built
    with ``nbinsx=50``. The right panel is a box plot of the same values,
    with dotted horizontal lines and arrow annotations at the mean and the
    median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to plot.
    col : str
        Name of the column; it must support ``.mean()`` and ``.median()``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    series = df[col]
    mean_val, median_val = series.mean(), series.median()
    # Grid position of the box-plot panel, reused by every call that targets it.
    box_cell = {"row": 1, "col": 2}

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=series, name=f"Histogram {col}", marker={"color": "#EBA0AC"}, nbinsx=50
    )
    fig.add_trace(histogram, row=1, col=1)

    # Dotted horizontal reference lines: mean first, then median.
    for level, colour in ((mean_val, "#A6E3A1"), (median_val, "#CBA6F7")):
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            line={"color": colour, "width": 3, "dash": "dot"},
            **box_cell,
        )

    # Arrow labels: mean at x=-0.7, median at x=0.7.
    for x_pos, level, caption in (
        (-0.7, mean_val, f"Mean = {mean_val:.2f}"),
        (0.7, median_val, f"Median = {median_val:.2f}"),
    ):
        fig.add_annotation(
            xref="paper",
            x=x_pos,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=caption,
            **box_cell,
        )

    box = go.Box(y=series, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(box, **box_cell)

    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        # Horizontal legend sitting just above the plot area, right-aligned.
        legend={
            "orientation": "h",
            "xanchor": "right",
            "x": 1,
            "yanchor": "bottom",
            "y": 1.02,
        },
    )
    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_xaxes(title_text="", **box_cell)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_yaxes(title_text=f"{col}", **box_cell)
    fig.show()

In [None]:
# Columns to profile; one histogram + box-plot figure is drawn per column.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

for col in music_vals:
    fun_subplots_plotly(df=df, col=col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Left-closed decade buckets, so a decade's first year (e.g. 1930) lands in that
# decade rather than the one before it. 2030 is the final break so that 2020
# gets a bucket of its own.
bins = pd.IntervalIndex.from_breaks(
    [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010, 2020, 2030],
    closed="left",
)

df["decade"] = pd.cut(df["year"], bins).astype("str")

# Interval text -> decade label, e.g. "[1920, 1930)" -> "1920s".
map_decade = {str(interval): f"{interval.left}s" for interval in bins}

# Years outside every bucket have no entry in the mapping and become NaN.
df["decade"] = df["decade"].map(map_decade)

df.head()

In [None]:
# Number of tracks per decade, sorted by decade label for the bar chart.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade")

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title=dict(text="Count of songs per decade", font=dict(size=24)),
    # Horizontal colour bar with a fixed pixel size, anchored by its top-right corner.
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
    margin=dict(r=10),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The counts table is only needed for this figure.
del decade_counts

In [None]:
# Keep only tracks that last at most five minutes.
df = df.loc[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade after the duration filter, sorted by decade label.
decade_counts = df["decade"].value_counts().reset_index()
decade_counts = decade_counts.sort_values(by="decade")

fig = px.bar(
    decade_counts,
    x="count",
    y="decade",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title=dict(text="Count of songs per decade (< 5min)", font=dict(size=24)),
    # Horizontal colour bar with a fixed pixel size, anchored by its top-right corner.
    coloraxis_colorbar=dict(
        title="Count",
        orientation="h",
        thicknessmode="pixels",
        thickness=20,
        lenmode="pixels",
        len=500,
        xanchor="right",
        x=0.95,
        yanchor="top",
        y=1.15,
    ),
    margin=dict(r=10),
)
fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The counts table is only needed for this figure.
del decade_counts

In [None]:
# Columns to plot; the per-decade aggregate cells below reuse this list.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per column, coloured by decade.
for col in cols:
    fig = px.histogram(
        df,
        x=col,
        color="decade",
        color_discrete_sequence=px.colors.qualitative.Light24,
    )
    fig.update_traces(opacity=0.7)
    fig.update_layout(
        title=dict(text=f"Histogram of {col} per decade", font=dict(size=24)),
        legend=dict(title="Decade"),
        barmode="stack",
        bargap=0.1,
        bargroupgap=0.05,
    )
    fig.update_yaxes(title="Count")
    fig.show()

In [None]:
# Mean of every column in `cols` per decade (rows: decades, columns: features).
decade_mean = df.groupby("decade")[cols].mean()

# Three panels per row; the row count is derived from the number of features
# so the grid keeps working when `cols` grows or shrinks.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_mean.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

# One horizontal bar chart per feature, filling the grid row by row.
for position, feature in enumerate(decade_mean.columns):
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend placed near the bottom right of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_mean

In [None]:
# Median of every column in `cols` per decade (rows: decades, columns: features).
decade_median = df.groupby("decade")[cols].median()

# Three panels per row; the row count is derived from the number of features
# so the grid keeps working when `cols` grows or shrinks.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(decade_median.columns) // n_grid_cols))  # ceiling division

fig = make_subplots(
    rows=n_grid_rows,
    cols=n_grid_cols,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

# One horizontal bar chart per feature, filling the grid row by row.
for position, feature in enumerate(decade_median.columns):
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=position // n_grid_cols + 1,
        col=position % n_grid_cols + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    # Vertical legend placed near the bottom right of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# (column to break down by, bar mode, figure title) for each count chart,
# drawn in this order: explicit, mode, key.
for category, bar_mode, chart_title in (
    ("explicit", "group", "Explicit Count per Decade"),
    ("mode", "group", "Mode Count per Decade"),
    ("key", "stack", "Key Count per Decade"),
):
    # Rows per (decade, category) pair; "year" only serves as a column to count.
    category_count = (
        df.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )

    fig = px.histogram(
        data_frame=category_count,
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# Remove the loop temporaries so this cell leaves no helper names behind.
del (category, bar_mode, chart_title, category_count)

In [None]:
# Columns plotted against popularity ("year" takes the place of "popularity" here).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One scatter plot per column with an OLS trend line drawn in green.
for col in cols:
    fig = px.scatter(
        df,
        x=col,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=dict(opacity=0.3, size=3, color="#B4BEFE"))
    fig.update_layout(title=dict(text=f"{col} vs popularity", font=dict(size=24)))
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Columns regressed against popularity, one panel each on a 3x4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

for i, col in enumerate(cols):
    # Subplot numbers are 1-based, hence the +1.
    ax = plt.subplot(3, 4, i + 1)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        ax=ax,
        scatter_kws=dict(alpha=0.5),
        line_kws=dict(color="#F38BA8"),
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation matrix.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

# Pairwise correlations, computed once and reused for the mask and the plot.
corr_matrix = df[cols].corr()

plt.figure(figsize=(10, 7))
# Hide the upper triangle (diagonal included) so each pair appears only once.
mask = np.triu(np.ones_like(corr_matrix))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    mask=mask,
)
plt.grid(visible=False)

In [None]:
# Reduce `artists` to its first entry: strip the list punctuation, then keep
# the text before the first comma.
# NOTE(review): assumes `artists` holds the text of a Python list such as
# "['A', 'B']" - confirm against data.csv. Names are not split on the letter
# "x", so names such as "Jimi Hendrix" are kept whole. A name that itself
# contains a comma is still cut at that comma.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace("[", "", regex=False)
    .str.replace("]", "", regex=False)
    .str.replace("'", "", regex=False)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Per-artist popularity summary, ranked by mean popularity (highest first).
# The aggregations are passed as a list so the column order is the same on every run.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these are hard-coded row labels in the ranked table. They depend
# on the exact data and on how `first_artist` was derived - confirm which
# artists they are meant to remove. A missing label raises KeyError.
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then take the 50 with the highest mean.
to_drop = popularity_check[popularity_check["count"] <= 45].index
popularity_check = popularity_check.drop(to_drop)
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track of each selected artist, artist by artist in the order of
# `popularity_check`, and concatenate once instead of growing a frame in a loop.
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]
# pd.concat rejects an empty list, so fall back to an empty frame in that case.
top_50_artists = pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()

top_50_artists = top_50_artists.reset_index(drop=True)
top_50_artists.head()

In [None]:
# Horizontal bars of track counts per selected artist, smallest count first.
plt.figure(figsize=(12, 8))
top_50_artists["first_artist"].value_counts().sort_values().plot.barh()
plt.show()

In [None]:
# Horizontal bars of track counts per decade for the selected artists, smallest first.
top_50_artists["decade"].value_counts().sort_values().plot.barh()

In [None]:
# Columns whose per-artist density curves are drawn, one panel each on a 3x4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

for i, col in enumerate(cols):
    # Subplot numbers are 1-based, hence the +1.
    ax = plt.subplot(3, 4, i + 1)
    # One curve per artist; the legend is turned off.
    sns.kdeplot(x=col, hue="first_artist", data=top_50_artists, legend=False, ax=ax)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

warnings.filterwarnings('ignore')

pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


%matplotlib inline
%config InlineBackend.figure_format = 'retina'
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Read the track table; `release_date` is parsed into datetimes while loading.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Numeric key code -> note name; a note's position in the list is its code.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)
# Numeric mode flag -> label.
map_mode = {0: "Minor", 1: "Major"}

# Swap the numeric codes for labels; values missing from a mapping become NaN.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Index labels of rows that repeat an earlier (artists, name) pair;
# the first occurrence of each pair is not flagged.
check_dups = df[["artists", "name"]]
is_repeat = check_dups.duplicated()
dups = check_dups.index[is_repeat.to_numpy()]

In [None]:
# Print the frame shape on either side of removing the duplicate rows.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Milliseconds -> minutes (60_000 ms per minute), then rename the column to match.
df["duration_ms"] = df["duration_ms"].div(60_000)
df = df.rename(columns={"duration_ms": "duration_min"})
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Remove the `release_date` and `id` columns from the working frame.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of ``df[col]`` side by side.

    The left panel (75 % of the width) holds a 50-bin histogram. The right
    panel holds a box plot with dotted horizontal lines and arrow annotations
    marking the mean and the median of the column.

    Parameters
    ----------
    df : table indexed by column name (``df[col]`` must support
        ``.mean()`` and ``.median()``).
    col : name of the column to plot.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=values,
        name=f"Histogram {col}",
        marker={"color": "#EBA0AC"},
        nbinsx=50,
    )
    fig.add_trace(histogram, row=1, col=1)

    # (statistic, line colour, annotation x position, annotation text)
    reference_lines = (
        (mean_val, "#A6E3A1", -0.7, f"Mean = {mean_val:.2f}"),
        (median_val, "#CBA6F7", 0.7, f"Median = {median_val:.2f}"),
    )
    for level, line_colour, label_x, label_text in reference_lines:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            row=1,
            col=2,
            line={"color": line_colour, "width": 3, "dash": "dot"},
        )
        fig.add_annotation(
            xref="paper",
            x=label_x,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=label_text,
            row=1,
            col=2,
        )

    boxplot = go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(boxplot, row=1, col=2)

    # Horizontal legend placed just above the plotting area, right-aligned.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric track features that each get a distribution figure.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One histogram + box plot figure per feature.
for col in music_vals:
    fun_subplots_plotly(df, col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Decade start years 1920, 1930, ..., 2020 and the matching labels "1920s" ... "2020s".
decade_starts = list(range(1920, 2030, 10))
decade_labels = [f"{start}s" for start in decade_starts]

# Bin edges are left-closed ([1920, 1930), [1930, 1940), ...) so that a year ending
# in 0 is labelled with the decade it opens. Right-closed bins would file 1930
# under "1920s", 2020 under "2010s", and leave 1920 without a label.
bins = decade_starts + [2030]

df["decade"] = pd.cut(
    df["year"], bins=bins, right=False, labels=decade_labels
).astype(object)  # plain string labels (NaN outside 1920-2029) rather than a Categorical

df.head()

In [None]:
# Number of tracks per decade label, sorted by label.
# The columns are named explicitly: a bare value_counts().reset_index() only
# produces "decade"/"count" on pandas >= 2.0 (older versions give "index"/"decade").
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    # Horizontal colour bar, 20 px thick and 500 px long, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this figure.
del decade_counts

In [None]:
# Keep only the tracks whose duration_min is at most 5.
df = df[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade label after the duration filter, sorted by label.
# The columns are named explicitly: a bare value_counts().reset_index() only
# produces "decade"/"count" on pandas >= 2.0 (older versions give "index"/"decade").
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    # The filter applied to df is `duration_min <= 5`, so the title says "<=".
    title={"text": "Count of songs per decade (<= 5min)", "font": {"size": 24}},
    # Horizontal colour bar, 20 px thick and 500 px long, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this figure.
del decade_counts

In [None]:
# Features whose distributions are split by decade.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per feature, with the bars coloured by decade.
for col in cols:
    fig = (
        px.histogram(
            df,
            x=col,
            color="decade",
            color_discrete_sequence=px.colors.qualitative.Light24,
        )
        .update_traces(opacity=0.7)
        .update_layout(
            title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
            legend={"title": "Decade"},
            barmode="stack",
            bargap=0.1,
            bargroupgap=0.05,
        )
        .update_yaxes(title="Count")
    )
    fig.show()

In [None]:
# Per-decade mean of every feature currently listed in `cols`.
decade_mean = df.groupby("decade")[cols].mean()

# Three panels per row; the row count follows the number of features
# (11 features -> 4 rows, as in the hand-written layout this replaces).
panels_per_row = 3
panel_rows = max(1, -(-len(decade_mean.columns) // panels_per_row))  # ceiling division

fig = make_subplots(
    rows=panel_rows,
    cols=panels_per_row,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

# One horizontal bar trace per feature, filled in row-major order.
for position, feature in enumerate(decade_mean.columns):
    panel_row, panel_col = divmod(position, panels_per_row)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Legend anchored in the lower-right part of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_mean

In [None]:
# Per-decade median of every feature currently listed in `cols`.
decade_median = df.groupby("decade")[cols].median()

# Three panels per row; the row count follows the number of features
# (11 features -> 4 rows, as in the hand-written layout this replaces).
panels_per_row = 3
panel_rows = max(1, -(-len(decade_median.columns) // panels_per_row))  # ceiling division

fig = make_subplots(
    rows=panel_rows,
    cols=panels_per_row,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

# One horizontal bar trace per feature, filled in row-major order.
for position, feature in enumerate(decade_median.columns):
    panel_row, panel_col = divmod(position, panels_per_row)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    # Legend anchored in the lower-right part of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_median

In [None]:
# Track counts per (decade, category) pair, built for all three categories
# before any figure is drawn. Counting the "year" column gives the row count.
count_tables = {
    category: (
        df.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )
    for category in ("explicit", "key", "mode")
}

# (category column, figure title, bar mode), in display order.
chart_specs = (
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
)

for category, chart_title, bar_mode in chart_specs:
    fig = px.histogram(
        data_frame=count_tables[category],
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# The summary tables are only needed for these figures.
del count_tables

In [None]:
# Features plotted against popularity.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One scatter plot per feature with an OLS trend line drawn in green.
for col in cols:
    fig = (
        px.scatter(
            df,
            x=col,
            y="popularity",
            trendline="ols",
            trendline_color_override="#A6E3A1",
        )
        .update_traces(marker={"opacity": 0.3, "size": 3, "color": "#B4BEFE"})
        .update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    )
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed against popularity.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# Fill a 3 x 4 grid slot by slot; matplotlib subplot slots are numbered from 1.
for slot, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, slot)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        ax=ax,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
    )

plt.tight_layout()
plt.show()

In [None]:
# Features included in the correlation matrix (popularity last).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

corr_matrix = df[cols].corr()
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr_matrix))

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap="Blues")
plt.grid(visible=False)

In [None]:
# Extract the first artist from the stringified list in "artists":
# strip the list brackets and single quotes, then keep the text before the first comma.
# The previous version first split every string on the letter "x", which cut
# off any name containing an "x" (e.g. "Jimi Hendrix" became "Jimi Hendri").
# NOTE(review): later cells drop hard-coded row labels from the per-artist
# aggregate; those labels were chosen against the truncated names - re-check them.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Mean / count / median popularity per first artist, most popular first.
# The aggregations are passed as a list (not a set) so the column order is
# the same on every run.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these labels are row positions in the mean-sorted table, so they
# point at different artists whenever the data or the name parsing changes -
# confirm which artists were meant and drop them by name instead.
# errors="ignore" keeps the cell running when a label is not present.
popularity_check = popularity_check.drop(
    [16091, 16086, 16429], axis=0, errors="ignore"
)

# Keep artists with more than 45 tracks, then take the 50 with the highest mean.
popularity_check = (
    popularity_check[popularity_check["count"] > 45]
    .sort_values(by="mean", ascending=False)
    .head(50)
)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect the tracks of every artist in popularity_check, keeping the artists
# in popularity_check order. The frames are concatenated once instead of
# growing a DataFrame inside the loop (quadratic copying, and it started from
# an empty frame).
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Tracks per artist in the top-50 subset, smallest count first.
plt.figure(figsize=(12, 8))
artist_track_counts = top_50_artists["first_artist"].value_counts()
artist_track_counts.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Tracks per decade in the top-50 subset, smallest count first.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot.barh()

In [None]:
# Features whose densities are compared between the top artists.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# One panel per feature in a 3 x 4 grid, one density curve per artist.
# The legend is switched off.
for slot, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, slot)
    sns.kdeplot(x=col, hue="first_artist", data=top_50_artists, legend=False)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import mplcatppuccin
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import warnings
import plotly.io as pio

# Suppress every warning for the whole session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

# Plotly defaults: 'notebook_connected' renderer and the dark template.
pio.renderers.default = 'notebook_connected'
pio.templates.default = 'plotly_dark'


# Draw matplotlib figures inline, using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# DataFrame display: no column limit, precision 2, left-justified column headers.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)
pd.set_option('display.colheader_justify', 'left')

# Figure-level rc defaults: automatic layout and title size 18.
mpl.rc(
 'figure',
 autolayout=True,
 titlesize=18
)

# Apply the 'ggplot' style, then 'mocha' on top of it.
# NOTE(review): 'mocha' is presumably registered by the mplcatppuccin import - confirm.
mpl.style.use(['ggplot', 'mocha'])


### Header 3.2


In [None]:
# Load the track table; `release_date` is parsed as a date column while reading.
df = pd.read_csv("data.csv", parse_dates=["release_date"])
df.head()

In [None]:
# Numeric key code (0-11) -> note name.
map_key = dict(
    enumerate(["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"])
)
# Numeric mode flag -> scale name.
map_mode = {0: "Minor", 1: "Major"}

# Swap the numeric codes for their labels; codes missing from a map become NaN.
df["key"] = df["key"].map(map_key)
df["mode"] = df["mode"].map(map_mode)

In [None]:
# Index labels of rows whose (artists, name) pair already appeared earlier in the frame.
check_dups = df.loc[:, ["artists", "name"]]
dups = check_dups.index[check_dups.duplicated().to_numpy()]

In [None]:
# Remove the duplicate rows found above and report the frame shape before and after.
print("Before dropping duplicated:", df.shape)
df = df.drop(index=dups)
print("After dropping duplicated:", df.shape)

# Header 1.3

## Header 2.3

### Header 3.5

In [None]:
# Divide the duration column by 60_000 and rename it from *_ms to *_min to match.
df["duration_ms"] = df["duration_ms"].div(60_000)
df = df.rename(columns={"duration_ms": "duration_min"})
df.head()

# Header 1.4

## Header 2.4

### Header 3.5

In [None]:
# Drop two columns that the rest of the notebook does not use.
df = df.drop(columns=["release_date", "id"])
df.head()

In [None]:
def fun_subplots_plotly(df, col):
    """Show a histogram and a box plot of ``df[col]`` side by side.

    The left panel (75 % of the width) holds a 50-bin histogram. The right
    panel holds a box plot with dotted horizontal lines and arrow annotations
    marking the mean and the median of the column.

    Parameters
    ----------
    df : table indexed by column name (``df[col]`` must support
        ``.mean()`` and ``.median()``).
    col : name of the column to plot.

    Returns
    -------
    None. The figure is displayed with ``fig.show()``.
    """
    values = df[col]
    mean_val = values.mean()
    median_val = values.median()

    fig = make_subplots(rows=1, cols=2, column_widths=[0.75, 0.25])

    histogram = go.Histogram(
        x=values,
        name=f"Histogram {col}",
        marker={"color": "#EBA0AC"},
        nbinsx=50,
    )
    fig.add_trace(histogram, row=1, col=1)

    # (statistic, line colour, annotation x position, annotation text)
    reference_lines = (
        (mean_val, "#A6E3A1", -0.7, f"Mean = {mean_val:.2f}"),
        (median_val, "#CBA6F7", 0.7, f"Median = {median_val:.2f}"),
    )
    for level, line_colour, label_x, label_text in reference_lines:
        fig.add_shape(
            type="line",
            xref="paper",
            yref="y",
            x0=-1,
            x1=1,
            y0=level,
            y1=level,
            row=1,
            col=2,
            line={"color": line_colour, "width": 3, "dash": "dot"},
        )
        fig.add_annotation(
            xref="paper",
            x=label_x,
            y=level,
            showarrow=True,
            arrowhead=2,
            text=label_text,
            row=1,
            col=2,
        )

    boxplot = go.Box(y=values, name=f"Boxplot {col}", marker={"color": "#F9E2AF"})
    fig.add_trace(boxplot, row=1, col=2)

    # Horizontal legend placed just above the plotting area, right-aligned.
    legend_style = {
        "orientation": "h",
        "yanchor": "bottom",
        "y": 1.02,
        "xanchor": "right",
        "x": 1,
    }
    fig.update_layout(
        title={"text": f"Distribution Plot {col}", "font": {"size": 24}},
        legend=legend_style,
    )

    fig.update_xaxes(title_text=f"{col}", row=1, col=1)
    fig.update_yaxes(title_text="Count", row=1, col=1)
    fig.update_xaxes(title_text="", row=1, col=2)
    fig.update_yaxes(title_text=f"{col}", row=1, col=2)
    fig.show()

In [None]:
# Numeric track features that each get a distribution figure.
music_vals = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One histogram + box plot figure per feature.
for col in music_vals:
    fun_subplots_plotly(df, col)

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Decade start years 1920, 1930, ..., 2020 and the matching labels "1920s" ... "2020s".
decade_starts = list(range(1920, 2030, 10))
decade_labels = [f"{start}s" for start in decade_starts]

# Bin edges are left-closed ([1920, 1930), [1930, 1940), ...) so that a year ending
# in 0 is labelled with the decade it opens. Right-closed bins would file 1930
# under "1920s", 2020 under "2010s", and leave 1920 without a label.
bins = decade_starts + [2030]

df["decade"] = pd.cut(
    df["year"], bins=bins, right=False, labels=decade_labels
).astype(object)  # plain string labels (NaN outside 1920-2029) rather than a Categorical

df.head()

In [None]:
# Number of tracks per decade label, sorted by label.
# The columns are named explicitly: a bare value_counts().reset_index() only
# produces "decade"/"count" on pandas >= 2.0 (older versions give "index"/"decade").
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    # Horizontal colour bar, 20 px thick and 500 px long, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this figure.
del decade_counts

In [None]:
# Keep only the tracks whose duration_min is at most 5.
df = df[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade label after the duration filter, sorted by label.
# The columns are named explicitly: a bare value_counts().reset_index() only
# produces "decade"/"count" on pandas >= 2.0 (older versions give "index"/"decade").
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    # The filter applied to df is `duration_min <= 5`, so the title says "<=".
    title={"text": "Count of songs per decade (<= 5min)", "font": {"size": 24}},
    # Horizontal colour bar, 20 px thick and 500 px long, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this figure.
del decade_counts

In [None]:
# Features whose distributions are split by decade.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per feature, with the bars coloured by decade.
for col in cols:
    fig = (
        px.histogram(
            df,
            x=col,
            color="decade",
            color_discrete_sequence=px.colors.qualitative.Light24,
        )
        .update_traces(opacity=0.7)
        .update_layout(
            title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
            legend={"title": "Decade"},
            barmode="stack",
            bargap=0.1,
            bargroupgap=0.05,
        )
        .update_yaxes(title="Count")
    )
    fig.show()

In [None]:
# Per-decade mean of every feature currently listed in `cols`.
decade_mean = df.groupby("decade")[cols].mean()

# Three panels per row; the row count follows the number of features
# (11 features -> 4 rows, as in the hand-written layout this replaces).
panels_per_row = 3
panel_rows = max(1, -(-len(decade_mean.columns) // panels_per_row))  # ceiling division

fig = make_subplots(
    rows=panel_rows,
    cols=panels_per_row,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

# One horizontal bar trace per feature, filled in row-major order.
for position, feature in enumerate(decade_mean.columns):
    panel_row, panel_col = divmod(position, panels_per_row)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Legend anchored in the lower-right part of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_mean

In [None]:
# Per-decade median of every feature currently listed in `cols`.
decade_median = df.groupby("decade")[cols].median()

# Three panels per row; the row count follows the number of features
# (11 features -> 4 rows, as in the hand-written layout this replaces).
panels_per_row = 3
panel_rows = max(1, -(-len(decade_median.columns) // panels_per_row))  # ceiling division

fig = make_subplots(
    rows=panel_rows,
    cols=panels_per_row,
    subplot_titles=tuple(decade_median.columns),
    vertical_spacing=0.1,
)

# One horizontal bar trace per feature, filled in row-major order.
for position, feature in enumerate(decade_median.columns):
    panel_row, panel_col = divmod(position, panels_per_row)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    # Legend anchored in the lower-right part of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_median

In [None]:
# Track counts per (decade, category) pair, built for all three categories
# before any figure is drawn. Counting the "year" column gives the row count.
count_tables = {
    category: (
        df.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )
    for category in ("explicit", "key", "mode")
}

# (category column, figure title, bar mode), in display order.
chart_specs = (
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
)

for category, chart_title, bar_mode in chart_specs:
    fig = px.histogram(
        data_frame=count_tables[category],
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title={"text": chart_title, "font": {"size": 24}},
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# The summary tables are only needed for these figures.
del count_tables

In [None]:
# Features plotted against popularity.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# One scatter plot per feature with an OLS trend line drawn in green.
for col in cols:
    fig = (
        px.scatter(
            df,
            x=col,
            y="popularity",
            trendline="ols",
            trendline_color_override="#A6E3A1",
        )
        .update_traces(marker={"opacity": 0.3, "size": 3, "color": "#B4BEFE"})
        .update_layout(title={"text": f"{col} vs popularity", "font": {"size": 24}})
    )
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Features regressed against popularity.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# Fill a 3 x 4 grid slot by slot; matplotlib subplot slots are numbered from 1.
for slot, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, slot)
    sns.regplot(
        x="popularity",
        y=col,
        data=df,
        ax=ax,
        scatter_kws={"alpha": 0.5},
        line_kws={"color": "#F38BA8"},
    )

plt.tight_layout()
plt.show()

In [None]:
# Features included in the correlation matrix (popularity last).
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo", "popularity",
]

corr_matrix = df[cols].corr()
# Mask the upper triangle, diagonal included, so each pair is shown once.
mask = np.triu(np.ones_like(corr_matrix))

plt.figure(figsize=(10, 7))
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt=".2f", cmap="Blues")
plt.grid(visible=False)

In [None]:
# Extract the first artist from the stringified list in "artists":
# strip the list brackets and single quotes, then keep the text before the first comma.
# The previous version first split every string on the letter "x", which cut
# off any name containing an "x" (e.g. "Jimi Hendrix" became "Jimi Hendri").
# NOTE(review): later cells drop hard-coded row labels from the per-artist
# aggregate; those labels were chosen against the truncated names - re-check them.
df["first_artist"] = (
    df["artists"]
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)
    .str.split(",")
    .str[0]
)
df = df.drop("artists", axis=1)

In [None]:
# Mean / count / median popularity per first artist, most popular first.
# The aggregations are passed as a list (not a set) so the column order is
# the same on every run.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these labels are row positions in the mean-sorted table, so they
# point at different artists whenever the data or the name parsing changes -
# confirm which artists were meant and drop them by name instead.
# errors="ignore" keeps the cell running when a label is not present.
popularity_check = popularity_check.drop(
    [16091, 16086, 16429], axis=0, errors="ignore"
)

# Keep artists with more than 45 tracks, then take the 50 with the highest mean.
popularity_check = (
    popularity_check[popularity_check["count"] > 45]
    .sort_values(by="mean", ascending=False)
    .head(50)
)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect the tracks of every artist in popularity_check, keeping the artists
# in popularity_check order. The frames are concatenated once instead of
# growing a DataFrame inside the loop (quadratic copying, and it started from
# an empty frame).
artist_frames = [
    df[df["first_artist"] == artist_name]
    for artist_name in popularity_check["first_artist"].unique()
]

top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else pd.DataFrame()
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Tracks per artist in the top-50 subset, smallest count first.
plt.figure(figsize=(12, 8))
artist_track_counts = top_50_artists["first_artist"].value_counts()
artist_track_counts.sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Tracks per decade in the top-50 subset, smallest count first.
decade_track_counts = top_50_artists["decade"].value_counts()
decade_track_counts.sort_values(ascending=True).plot.barh()

In [None]:
# Features whose densities are compared between the top artists.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# One panel per feature in a 3 x 4 grid, one density curve per artist.
# The legend is switched off.
for slot, col in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, slot)
    sns.kdeplot(x=col, hue="first_artist", data=top_50_artists, legend=False)

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Decade start years 1920, 1930, ..., 2020 and the matching labels "1920s" ... "2020s".
decade_starts = list(range(1920, 2030, 10))
decade_labels = [f"{start}s" for start in decade_starts]

# Bin edges are left-closed ([1920, 1930), [1930, 1940), ...) so that a year ending
# in 0 is labelled with the decade it opens. Right-closed bins would file 1930
# under "1920s", 2020 under "2010s", and leave 1920 without a label.
bins = decade_starts + [2030]

df["decade"] = pd.cut(
    df["year"], bins=bins, right=False, labels=decade_labels
).astype(object)  # plain string labels (NaN outside 1920-2029) rather than a Categorical

df.head()

In [None]:
# Number of tracks per decade label, sorted by label.
# The columns are named explicitly: a bare value_counts().reset_index() only
# produces "decade"/"count" on pandas >= 2.0 (older versions give "index"/"decade").
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    title={"text": "Count of songs per decade", "font": {"size": 24}},
    # Horizontal colour bar, 20 px thick and 500 px long, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this figure.
del decade_counts

In [None]:
# Keep only the tracks whose duration_min is at most 5.
df = df[df["duration_min"] <= 5]

# Header 1.2

## Header 2.2

### Header 3.4

In [None]:
# Number of tracks per decade label after the duration filter, sorted by label.
# The columns are named explicitly: a bare value_counts().reset_index() only
# produces "decade"/"count" on pandas >= 2.0 (older versions give "index"/"decade").
decade_counts = (
    df["decade"]
    .value_counts()
    .rename_axis("decade")
    .reset_index(name="count")
    .sort_values(by="decade", ascending=True)
)

fig = px.bar(
    data_frame=decade_counts,
    y="decade",
    x="count",
    color="count",
    color_continuous_scale=px.colors.sequential.Sunset,
    text_auto=True,
)

fig.update_layout(
    # The filter applied to df is `duration_min <= 5`, so the title says "<=".
    title={"text": "Count of songs per decade (<= 5min)", "font": {"size": 24}},
    # Horizontal colour bar, 20 px thick and 500 px long, placed above the plot.
    coloraxis_colorbar={
        "title": "Count",
        "thicknessmode": "pixels",
        "thickness": 20,
        "lenmode": "pixels",
        "len": 500,
        "yanchor": "top",
        "xanchor": "right",
        "y": 1.15,
        "x": 0.95,
        "orientation": "h",
    },
    margin={"r": 10},
)

fig.update_xaxes(title="Count")
fig.update_yaxes(title="Decades")
fig.show()

# The summary table is only needed for this figure.
del decade_counts

In [None]:
# Features whose distributions are split by decade.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "popularity", "speechiness", "tempo",
]

# One stacked histogram per feature, with the bars coloured by decade.
for col in cols:
    fig = (
        px.histogram(
            df,
            x=col,
            color="decade",
            color_discrete_sequence=px.colors.qualitative.Light24,
        )
        .update_traces(opacity=0.7)
        .update_layout(
            title={"text": f"Histogram of {col} per decade", "font": {"size": 24}},
            legend={"title": "Decade"},
            barmode="stack",
            bargap=0.1,
            bargroupgap=0.05,
        )
        .update_yaxes(title="Count")
    )
    fig.show()

In [None]:
# Per-decade mean of every feature currently listed in `cols`.
decade_mean = df.groupby("decade")[cols].mean()

# Three panels per row; the row count follows the number of features
# (11 features -> 4 rows, as in the hand-written layout this replaces).
panels_per_row = 3
panel_rows = max(1, -(-len(decade_mean.columns) // panels_per_row))  # ceiling division

fig = make_subplots(
    rows=panel_rows,
    cols=panels_per_row,
    subplot_titles=tuple(decade_mean.columns),
    vertical_spacing=0.1,
)

# One horizontal bar trace per feature, filled in row-major order.
for position, feature in enumerate(decade_mean.columns):
    panel_row, panel_col = divmod(position, panels_per_row)
    fig.add_trace(
        go.Bar(
            y=decade_mean.index,
            x=decade_mean.iloc[:, position],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Mean Musical Values per Decade", "font": {"size": 22}},
    # Legend anchored in the lower-right part of the figure.
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

# The summary table is only needed for this figure.
del decade_mean

In [None]:
# Median of every column listed in `cols`, one row per decade.
decade_median = df.groupby("decade")[cols].median()

# Three panels per row; the number of rows follows from the number of columns
# so the grid keeps working if `cols` grows or shrinks.
n_panel_cols = 3
n_panel_rows = -(-len(decade_median.columns) // n_panel_cols)  # ceiling division

fig = make_subplots(
    rows=n_panel_rows,
    cols=n_panel_cols,
    subplot_titles=list(decade_median.columns),
    vertical_spacing=0.1,
)

# One horizontal bar panel per column, filled left-to-right, top-to-bottom.
for position, feature in enumerate(decade_median.columns):
    panel_row, panel_col = divmod(position, n_panel_cols)
    fig.add_trace(
        go.Bar(
            y=decade_median.index,
            x=decade_median[feature],
            name=feature,
            orientation="h",
        ),
        row=panel_row + 1,  # plotly subplot coordinates are 1-based
        col=panel_col + 1,
    )

fig.update_layout(
    title={"text": "Median Musical Values per Decade", "font": {"size": 22}},
    legend={
        "orientation": "v",
        "yanchor": "top",
        "y": 0.2,
        "xanchor": "right",
        "x": 0.9,
    },
    margin={"b": 10, "r": 10},
    height=980,
)

fig.show()

del decade_median

In [None]:
# Rows per (decade, category) pair for each categorical column.
# `.count()` on "year" tallies the rows of each group with a non-null year.
counts_by_category = {
    category: (
        df.groupby(["decade", category], as_index=False)["year"]
        .count()
        .rename(columns={"year": "count"})
    )
    for category in ("explicit", "key", "mode")
}

# (category column, chart title, bar mode), listed in display order.
chart_specs = (
    ("explicit", "Explicit Count per Decade", "group"),
    ("mode", "Mode Count per Decade", "group"),
    ("key", "Key Count per Decade", "stack"),
)

for category, chart_title, bar_mode in chart_specs:
    fig = px.histogram(
        counts_by_category[category],
        x="decade",
        y="count",
        color=category,
        histfunc="sum",
        color_discrete_sequence=px.colors.qualitative.Pastel,
        text_auto=True,
    )
    fig.update_layout(
        title=dict(text=chart_title, font=dict(size=24)),
        bargap=0.2,
        bargroupgap=0.1,
        barmode=bar_mode,
        xaxis_title="Decade",
        yaxis_title="Count",
    )
    fig.show()

# The aggregated tables are only needed for these three charts.
del counts_by_category

In [None]:
# Columns plotted against popularity, one scatter chart each.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

# Small, mostly transparent markers; the same style is applied to every chart.
point_style = dict(opacity=0.3, size=3, color="#B4BEFE")

for feature in cols:
    # Scatter of popularity against the column, with an OLS trendline overlaid.
    fig = px.scatter(
        df,
        x=feature,
        y="popularity",
        trendline="ols",
        trendline_color_override="#A6E3A1",
    )
    fig.update_traces(marker=point_style)
    fig.update_layout(
        title=dict(text=f"{feature} vs popularity", font=dict(size=24))
    )
    fig.show()

# Header 1

## Header 2

### Header 3.1

In [None]:
# Columns regressed against popularity, one panel each in a 3x4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# Subplot positions are 1-based, so start the counter at 1.
for panel, feature in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    # Popularity goes on the x-axis here and the column on the y-axis.
    sns.regplot(
        x="popularity",
        y=feature,
        data=df,
        ax=ax,
        scatter_kws=dict(alpha=0.5),
        line_kws=dict(color="#F38BA8"),
    )

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation heatmap (popularity is listed last).
cols = [
    "valence",
    "acousticness",
    "danceability",
    "duration_min",
    "energy",
    "instrumentalness",
    "liveness",
    "loudness",
    "year",
    "speechiness",
    "tempo",
    "popularity",
]

# Compute the pairwise correlations once and reuse them for both the mask
# and the heatmap instead of recomputing the matrix.
feature_corr = df[cols].corr()

# Mask the upper triangle, diagonal included, so each pair is drawn only once.
mask = np.triu(np.ones_like(feature_corr, dtype=bool))

plt.figure(figsize=(10, 7))
sns.heatmap(
    feature_corr,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    mask=mask,
)
plt.grid(visible=False)

In [None]:
def _first_artist(raw):
    """Return the first comma-separated entry of the stringified `raw` value.

    The value is converted with ``str``, cut at the first "x", stripped of
    the characters ``[``, ``]`` and ``'``, and then cut at the first comma.
    """
    # NOTE(review): cutting at the letter "x" truncates any value that contains
    # an "x" before its first comma -- this looks unintended; confirm before
    # changing, because the next cell drops rows by hard-coded row labels that
    # depend on the current grouping.
    text = str(raw).split("x")[0]
    for unwanted in ("[", "]", "'"):
        text = text.replace(unwanted, "")
    return text.split(",")[0]


# Derive the lead artist and discard the raw "artists" column.
df["first_artist"] = df["artists"].apply(_first_artist)
df = df.drop(columns="artists")

In [None]:
# Per-artist popularity statistics, ranked by mean popularity (highest first).
# The aggregations are passed as a list, not a set, so the resulting column
# order is the same on every run.
popularity_check = (
    df.groupby(["first_artist"], as_index=False)["popularity"]
    .agg(["mean", "count", "median"])
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)

# NOTE(review): these row labels are positions in the mean-sorted ranking, so
# they depend on the exact input data and sort order. Which artists they refer
# to is not recorded here -- confirm and replace with a filter on
# "first_artist".
popularity_check = popularity_check.drop([16091, 16086, 16429], axis=0)

# Keep artists with more than 45 tracks, then take the 50 with the highest
# mean popularity.
popularity_check = popularity_check[popularity_check["count"] > 45]
popularity_check = popularity_check.sort_values(by="mean", ascending=False).head(50)

# Header 1

## Header 2

### Header 3.1

In [None]:
# Collect every track by the shortlisted artists, keeping the artists in the
# order they appear in `popularity_check`.
selected_artists = popularity_check["first_artist"].unique()
artist_frames = [df[df["first_artist"] == name] for name in selected_artists]

# Concatenate once instead of growing a DataFrame inside the loop. If no artist
# was shortlisted, fall back to an empty frame that keeps df's columns, because
# pd.concat raises on an empty list.
top_50_artists = (
    pd.concat(artist_frames, axis=0) if artist_frames else df.iloc[:0]
).reset_index(drop=True)
top_50_artists.head()

In [None]:
# Number of tracks per shortlisted artist, drawn as horizontal bars sorted by count.
plt.figure(figsize=(12, 8))
top_50_artists["first_artist"].value_counts().sort_values(ascending=True).plot.barh()
plt.show()

In [None]:
# Number of tracks by the shortlisted artists in each decade, sorted by count.
# Left as the trailing expression so the cell output is unchanged.
(
    top_50_artists["decade"]
    .value_counts()
    .sort_values(ascending=True)
    .plot.barh()
)

In [None]:
# Columns whose per-artist densities are drawn, one panel each in a 3x4 grid.
cols = [
    "valence", "acousticness", "danceability", "duration_min",
    "energy", "instrumentalness", "liveness", "loudness",
    "year", "speechiness", "tempo",
]

plt.figure(figsize=(15, 10))

# Subplot positions are 1-based, so start the counter at 1.
for panel, feature in enumerate(cols, start=1):
    ax = plt.subplot(3, 4, panel)
    # One density curve per artist; the legend is turned off.
    sns.kdeplot(
        data=top_50_artists,
        x=feature,
        hue="first_artist",
        legend=False,
        ax=ax,
    )

plt.tight_layout()
plt.show()

# Header 1

## Header 2

### Header 3.1