-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain.py
172 lines (129 loc) · 5.39 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Read in dataset
import pandas as pd
import numpy as np
import plotly
import plotly.graph_objs as go
from plotly.offline import plot
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import warnings
warnings.filterwarnings("ignore")
apps_with_duplicates = pd.read_csv(r"C:\Users\aasth\OneDrive\Desktop\Datacamp project\apps.csv")
# print('Total number of apps in the dataset duplicates = ', len(apps_with_duplicates))
# Drop duplicates from apps_with_duplicates
apps = apps_with_duplicates.drop_duplicates()
# Print the total number of apps
print('Total number of apps in the dataset = ', len(apps))
# Have a look at a random sample of 5 rows
n = 5
print(apps.sample(n))
# List of characters to remove
chars_to_remove = ['+', ',', '$']
# List of column names to clean
cols_to_clean = ['Installs', 'Price']
# Loop for each column in cols_to_clean
for col in cols_to_clean:
# Loop for each char in chars_to_remove
for char in chars_to_remove:
# Replace the character with an empty string
apps[col] = apps[col].apply(lambda x: x.replace(char, ''))
# Print a summary of the apps dataframe
print(apps.info)
# print(type(apps['Price'][2]))
# Convert Installs to float data type
apps['Installs'] = apps['Installs'].astype(float)
# Convert Price to float data type
apps['Price'] = apps['Price'].astype(float)
# Checking dtypes of the apps dataframe
print(apps.dtypes)
# print(type(apps['Price'][2]))
# Print the total number of unique categories
num_categories = len(apps['Category'].unique())
print('Number of categories = ', num_categories)
# Count the number of apps in each 'Category'.
num_apps_in_category = apps['Category'].value_counts()
# Sort num_apps_in_category in descending order based on the count of apps in each category
sorted_num_apps_in_category = num_apps_in_category.sort_values(ascending = False)
print(sorted_num_apps_in_category)
fig = go.Figure(data=go.Bar(x= num_apps_in_category.index, y=num_apps_in_category.values))
# Plot the figure
# fig.show()
# Average rating of apps
avg_app_rating = apps['Rating'].mean()
print('Average app rating = ', avg_app_rating)
# Distribution of apps according to their ratings
data = go.Figure(data=go.Histogram(x = apps['Rating']))
# Vertical dashed line to indicate the average app rating
layout = {'shapes': [{
'type' :'line',
'x0': avg_app_rating,
'y0': 0,
'x1': avg_app_rating,
'y1': 1000,
'line': { 'dash': 'dashdot'}
}]
}
# data.show()
# Select rows where both 'Rating' and 'Size' values are present (ie. the two values are not null)
apps_with_size_and_rating_present = apps[(~apps['Rating'].isnull()) & (~apps['Size'].isnull())]
# print(len(apps['Rating'].isnull()))
# Subset for categories with at least 250 apps
large_categories = apps_with_size_and_rating_present.groupby(['Category']).filter(lambda x: len(x) >= 250)
# Plot size vs. rating
plt1 = sns.jointplot(x = large_categories['Size'], y = large_categories['Rating'])
# plt1.show()
# Select apps whose 'Type' is 'Paid'
paid_apps = apps_with_size_and_rating_present[apps_with_size_and_rating_present['Type'] == 'Paid']
# Plot price vs. rating
plt2 = sns.jointplot(x = paid_apps['Price'], y = paid_apps['Rating'])
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
# Select a few popular app categories
popular_app_cats = apps[apps.Category.isin(['GAME', 'FAMILY', 'PHOTOGRAPHY',
'MEDICAL', 'TOOLS', 'FINANCE',
'LIFESTYLE','BUSINESS'])]
# Examine the price trend by plotting Price vs Category
ax = sns.stripplot(x = popular_app_cats['Price'], y = popular_app_cats['Category'], jitter=True, linewidth=1)
ax.set_title('App pricing trend across categories')
# Apps whose Price is greater than 200
apps_above_200 = popular_app_cats[popular_app_cats['Price'] > 200]
apps_above_200[['Category', 'App', 'Price']]
# Select apps priced below $100
apps_under_100 = popular_app_cats[popular_app_cats['Price'] < 100]
fig, ax = plt.subplots()
fig.set_size_inches(15, 8)
# Examine price vs category with the authentic apps (apps_under_100)
ax = sns.stripplot(x = 'Price', y = 'Category', data = apps_under_100, jitter = True, linewidth = 1)
ax.set_title('App pricing trend across categories after filtering for junk apps')
trace0 = go.Box(
# Data for paid apps
y = apps[apps['Type'] == 'Paid']['Installs'],
name = 'Paid'
)
trace1 = go.Box(
# Data for free apps
y = apps[apps['Type'] == 'Free']['Installs'],
name = 'Free'
)
layout = go.Layout(
title = "Number of downloads of paid apps vs. free apps",
yaxis = dict(title = "Log number of downloads",
type = 'log',
autorange = True)
)
# Add trace0 and trace1 to a list
data = [trace0, trace1]
# plotly.offline.plot({'data': data, 'layout': layout})
# Load user_reviews.csv
reviews_df = pd.read_csv(r'C:\Users\aasth\OneDrive\Desktop\Datacamp project\user_reviews.csv')
# Join the two dataframes
merged_df = pd.merge(apps, reviews_df, on = "App")
# Drop NA values from Sentiment and Review columns
merged_df = merged_df.dropna(subset = ['Sentiment', 'Reviews'])
sns.set_style('ticks')
fig, ax = plt.subplots()
fig.set_size_inches(11, 8)
# User review sentiment polarity for paid vs. free apps
ax = sns.boxplot(x = 'Type', y = 'Sentiment_Polarity', data = merged_df)
ax.set_title('Sentiment Polarity Distribution')