import pandas as pd
import matplotlib.pyplot as plt

# Problem set 6: Getting, cleaning, and checking your data

# Part A. Getting your data.

# In this project I am importing three CSVs: salaries-us.csv, which was scraped
# from HackerRank in 2016; salaries-ar.csv, which contains 2023 data from
# Argentina; and salaries-in.csv, which was scraped from Glassdoor in 2022.
# The datasets include varying kinds of information, such as pay rates,
# locations, job titles and their categories, as well as fields less relevant
# to our analysis like ratings or comments. I am using pandas to read the CSVs
# into DataFrames.

# Use pandas to read each CSV file into a DataFrame
us_eu_salaries = pd.read_csv('source/salaries-us.csv')
in_salaries = pd.read_csv('source/salaries-in.csv')
ar_salaries = pd.read_csv('source/salaries-ar.csv')

# RESEARCH QUESTIONS
# How does compensation vary between countries depending on:
#   - the job title? E.g. how much more do Android devs make than web devs?
#   - the company? E.g. does Apple pay significantly more than Expedia for the same role?
#   - location? E.g. what's the opportunity cost of working in Bangalore vs. Delhi, or Miami vs. New York?
#   - other factors? E.g. does more experience mean more pay? Does more experience at the same company mean more pay?

# TABLE PLAN
# TABLE 1: Tech Salaries:
# [employer_name] [location_name] [location_state] [location_country]
# [job_title] [job_title_category] [job_title_rank] [total_experience_years]
# [employer_experience_years] [annual_base_pay] [signing_bonus] [annual_bonus]
# [stock_value_bonus] [total_compensation]

# TABLE 2: Indian Salaries:
# [employer_name] [location_name] [job_title] [annual_base_pay] [employment_status] [job_roles]
# All currencies will be converted to USD for ease of comparison, using the
# current conversion rate: 1 Indian rupee = 0.012 US dollars.

# TABLE 3: Both:
# From there, I will merge the tables into a new table with the following
# columns. I will be stricter about removing interns so that it is a comparison
# of full-time employees only across India and the western countries.
# [employer_name] [location_name] [job_title] [annual_base_pay]

# 3. Write the code needed to take the data you loaded in and produce the table
# or tables that you want for your preliminary analysis plan. By the end of
# this part of your script you should have a dataframe or a few dataframes that
# are appropriate for your analysis. (Again, most likely you will revisit and
# revise this code later when you are actually doing the analysis and realize
# you need to change the data cleaning "upstream.")

# First, I need to clean the datasets so that they are more or less equivalent.
# I will do this by standardizing the column names, removing unnecessary
# columns, and converting the currencies into USD. I will also try to
# standardize the location format so that I can compare locations more easily.

# So I can see all columns when printing
pd.set_option('display.max_columns', None)

# TECH SALARIES
# To start, there were several vulgar/spam/unvalidated entries under the
# category "Other", plus some jobs that weren't tech-related at all, that I
# wanted to remove. However, some "Other" entries were also real and valuable.
# I checked what would happen if I took a strict approach to data cleaning:
# strict_us_eu_salaries = us_eu_salaries.query('job_title_category != "Other"')
# print(strict_us_eu_salaries.head())
# print("Shape of the strict-Other-filtered DataFrame:", strict_us_eu_salaries.shape)

# That removed quite a lot of relevant entries (1655 -> 1366), so a more
# careful approach is warranted. I noticed that many of the real "Other" jobs
# had a longitude and latitude, so I modified my query as follows:
query_string = (
    '(job_title_category != "Other") | '
    '((job_title_category == "Other") & '
    '(location_latitude.notnull() & location_longitude.notnull()))'
)

# Apply the query to get the new DataFrame
filt_us_eu_salaries = us_eu_salaries.query(query_string)
# That cut things down to 1500 entries and removed most of the spam, which was
# a happy trade-off for me.

# I then added a total-compensation column, summing only the valid values
compensation = ['annual_base_pay', 'signing_bonus', 'annual_bonus', 'stock_value_bonus']
filt_us_eu_salaries = filt_us_eu_salaries.copy()  # Copy to avoid SettingWithCopyWarning
filt_us_eu_salaries['total'] = filt_us_eu_salaries[compensation].apply(
    lambda row: sum(val for val in row if pd.notna(val) and isinstance(val, (int, float))),
    axis=1
)

# Add a country column to label this dataset before merging
filt_us_eu_salaries['country'] = 'US/EU'

# Move the 'country' column to position 0
columns = ['country'] + [col for col in filt_us_eu_salaries.columns if col != 'country']
filt_us_eu_salaries = filt_us_eu_salaries[columns]

# I finished cleaning this dataset by removing columns irrelevant to the analysis.
columns_to_drop = ['index', 'job_title_category', 'job_title_rank', 'salary_id',
                   'location_latitude', 'location_longitude', 'comments', 'submitted_at']
us_eu_df = filt_us_eu_salaries.drop(columns=columns_to_drop)

# INDIA SALARIES
# There was basically no spam and there were no errors in this dataset. The
# biggest thing to resolve was the "Salaries Reported" column -- I wanted to
# flatten that out so that every salary reported gets its own row.

# Create a new DataFrame by repeating each row by its "Salaries Reported" count
india_df = in_salaries.loc[in_salaries.index.repeat(in_salaries['Salaries Reported'])]

# Reset the index to get a new default integer index
india_df.reset_index(drop=True, inplace=True)

# Drop the "Salaries Reported" column now that it is baked into the row counts
india_df.drop('Salaries Reported', axis=1, inplace=True)

# I also renamed columns to match the format of the other DataFrame:
india_df.rename(columns={
    'Company Name': 'employer_name',
    'Location': 'location_name',
    'Job Title': 'job_title',
    'Salary': 'annual_base_pay',
    'Employment Status': 'employment_status',
    'Job Roles': 'job_roles'
}, inplace=True)

# Convert the values in the "annual_base_pay" column to USD
# Conversion rate from Indian rupees to USD
conversion_rate = 0.012
india_df['annual_base_pay'] = india_df['annual_base_pay'] * conversion_rate

# Replace the ratings column with a country column. (The rating is the first
# column of the source CSV; rename it so it can be dropped by name below.)
india_df['country'] = 'India'
india_df = india_df.rename(columns={india_df.columns[0]: 'old_column_name'})

# Move the 'country' column to position 0
columns = ['country'] + [col for col in india_df.columns if col != 'country']
india_df = india_df[columns]

# Drop the original first column
india_df = india_df.drop(columns=['old_column_name'])
# And I was happy with that!
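# To make the flattening step above easier to follow, here is a minimal toy
# example (illustrative data only, not from the real dataset) of how
# index.repeat() expands aggregated rows into one row per reported salary:
# demo = pd.DataFrame({'Salary': [100, 200], 'Salaries Reported': [2, 3]})
# flat = demo.loc[demo.index.repeat(demo['Salaries Reported'])].reset_index(drop=True)
# print(flat)  # 5 rows: the first salary twice, the second three times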
# ARGENTINA SALARIES
# This dataset needed to be translated, and its irrelevant free-text columns removed.

# 1. Keep only the selected columns
# I'm using iloc here because it's way too hard to remember the Spanish column names, sorry
ar_df = ar_salaries.iloc[:, [0, 1, 2, 3, 4, 17, 18, 19, 20, 21, 22, 30, 35, 36, 37, 45, 46, 48]]

# 2. Rename columns to their English equivalents
english_columns = [
    "employment_status",  # NOTE: this first column actually holds the respondent's country; it is renamed to 'country' below
    "location_name",
    "dedication",
    "contract_type",
    "monthly_salary_or_withdrawal",
    "satisfaction_with_income",
    "looking_for_job",
    "job_title",
    "years_of_experience",
    "seniority_in_company",
    "time_in_position",
    "work_modality",
    "state",
    "career",
    "university",
    "age",
    "gender_identity",
    "seniority"
]
ar_df.columns = english_columns

# 3. Build a rough Spanish-to-English translation map for the responses
translations = {
    'Ciudad Autónoma de Buenos Aires': 'Buenos Aires City',
    'Remoto (empresa de otro país)': 'Remote (foreign company)',
    'Tercerizado (trabajo a través de consultora o agencia)': 'Contract',
    'Staff (planta permanente)': 'Staff (permanent)',
    'Sí, estoy buscando activamente.': 'Yes, I am actively looking.',
    'No, estoy muy conforme.': 'No, I am very satisfied.',
    'No, pero escucho ofertas.': 'No, but I am open to offers.',
    'Híbrido (presencial y remoto)': 'Hybrid (in-person and remote)',
    'En curso': 'In progress',
    'Completo': 'Completed',
    'Incompleto': 'Incomplete',
    'Universitario': 'University degree',
    'Terciario': 'Tertiary degree',
    'Secundario': 'Secondary degree',
    'Varón Cis': 'Male',
    'Mujer Cis': 'Female',
}

# Create a copy of the DataFrame to avoid SettingWithCopyWarning
ar_df = ar_df.copy()

# Convert monthly pay to USD and multiply by 12 to get an annual figure
exchange_rate = 0.0028
ar_df["annual_base_pay"] = ar_df["monthly_salary_or_withdrawal"] * exchange_rate * 12

# Drop the original monthly column
ar_df.drop(columns=["monthly_salary_or_withdrawal"], inplace=True)

# The first retained column holds the respondent's country, so rename it to
# match the other DataFrames
ar_df.rename(columns={"employment_status": "country"}, inplace=True)

# Loop through each column to replace Spanish responses with English
for column in ar_df.columns:
    ar_df.loc[:, column] = ar_df[column].replace(translations)

# Write out the cleaned CSVs
ar_df.to_csv('cleaned/salaries-ar-cleaned.csv', index=False)
india_df.to_csv('cleaned/salaries-in-cleaned.csv', index=False)
us_eu_df.to_csv('cleaned/salaries-us-cleaned.csv', index=False)

# CHECKING DATA
# Distribution histogram
def perform_data_checks(df, df_name):
    column = "annual_base_pay"
    plt.hist(df[column], bins=50)  # Adjust the number of bins as needed
    plt.title(f"Annual Base Pay Distribution in {df_name}")
    plt.xlabel(column)
    plt.ylabel("Frequency")
    plt.show()

# Run the check on each cleaned DataFrame
perform_data_checks(us_eu_df, "US/EU")
perform_data_checks(india_df, "India")
perform_data_checks(ar_df, "Argentina")

# After I ran the histograms, I found several outlier values that were
# distorting the data: they stretched the x-axis out to 1e7 (the tens of
# millions) and concentrated everything in a sharp spike on the left. I will
# remove them here using the IQR method (and I will also remove any rows with
# empty values that might still be hanging around).
def remove_empty_and_outliers_iqr(df, column, multiplier=6):
    # Calculate Q1, Q3, and the IQR
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    # Define the lower and upper bounds for outlier detection. A multiplier of
    # 6 is deliberately generous (the conventional value is 1.5), so only the
    # truly extreme values are dropped.
    lower_bound = q1 - multiplier * iqr
    upper_bound = q3 + multiplier * iqr

    # Remove outliers
    df_no_outliers = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

    # Remove empty and zero values
    df_cleaned = df_no_outliers[(df_no_outliers[column].notna()) & (df_no_outliers[column] != 0)]
    return df_cleaned

# Remove outliers
us_eu_df = remove_empty_and_outliers_iqr(us_eu_df, "annual_base_pay")
india_df = remove_empty_and_outliers_iqr(india_df, "annual_base_pay")
ar_df = remove_empty_and_outliers_iqr(ar_df, "annual_base_pay")

# I'm happy with this and will now proceed with my actual data analysis.

# DATA ANALYSIS

# 1. Given the same role (Developer / Engineer), how much does each country pay on average?

# Concatenate the DataFrames into one for easier processing
all_df = pd.concat([us_eu_df, india_df, ar_df])

# Filter rows containing 'Developer' or 'Engineer' in the job_title column
# (case-insensitive; na=False skips rows with missing titles)
developer_engineer_df = all_df[all_df['job_title'].str.contains('developer|engineer', case=False, regex=True, na=False)]

# Group by country and compute the average pay, as well as the count of entries
average_pay_by_country = developer_engineer_df.groupby('country')['annual_base_pay'].agg(['mean', 'count']).reset_index()
average_pay_by_country.columns = ['country', 'annual_base_pay', 'entry_count']
print(average_pay_by_country.head())

# Plot the bar graph
plt.figure(figsize=(10, 6))
plt.bar(average_pay_by_country['country'], average_pay_by_country['annual_base_pay'],
        color=['blue', 'orange', 'green'])
plt.xlabel('Country')
plt.ylabel('Average Annual Base Pay')
plt.title('Average Pay for Developer/Engineer Roles by Country')
plt.show()

# 1.5 The same comparison, set against cost of living
expenses_data = {
    'Country': ['US/EU', 'India', 'Argentina'],
    'Monthly Expenses (USD)': [3040.1, 596.8, 715.38],
    'Yearly Expenses (USD)': [36481.2, 7161.6, 8584.56]
}
expenses_df = pd.DataFrame(expenses_data)

# Merge the average pay and expenses data
merged_df = pd.merge(average_pay_by_country, expenses_df, left_on='country', right_on='Country', how='left')
print(merged_df.head())

# Extract the relevant data for plotting
countries = merged_df['country']
base_pay = merged_df['annual_base_pay']
yearly_expenses = merged_df['Yearly Expenses (USD)']

# Set up the bar positions
bar_width = 0.35
index = range(len(countries))

# Plot grouped bars: pay next to expenses for each country
plt.bar(index, base_pay, bar_width, label='Annual Base Pay')
plt.bar([i + bar_width for i in index], yearly_expenses, bar_width, label='Yearly Expenses')

# Customize the plot
plt.xlabel('Country')
plt.ylabel('Amount (USD)')
plt.title('Annual Base Pay and Yearly Expenses by Country')
plt.xticks([i + bar_width / 2 for i in index], countries)
plt.legend()

# Show the plot
plt.show()
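# A compact way to read this comparison (my addition, not part of the original
# plan): the ratio of average pay to yearly expenses, i.e. how many years of
# living costs one year of base pay covers in each place.
merged_df['pay_to_expenses_ratio'] = merged_df['annual_base_pay'] / merged_df['Yearly Expenses (USD)']
print(merged_df[['country', 'pay_to_expenses_ratio']])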
# 2-3. What are the top 5 highest paid positions and regions in each country?
def top_pivot(df, column_name):
    # Compute the average annual base pay and entry count for each unique
    # value in the specified column
    avg_pay_df = df.groupby(column_name)['annual_base_pay'].agg(['mean', 'count']).reset_index()

    # Sort by average pay in descending order and take the top 5. (An earlier
    # version re-pivoted these five rows, which collapsed every count to 1;
    # sorting the aggregated frame directly keeps the real entry counts.)
    top_5 = avg_pay_df.sort_values(by='mean', ascending=False).head(5)
    return top_5.set_index(column_name)

# Top 5 highest paid positions in each country
us_eu_top_paid_positions = top_pivot(us_eu_df, "job_title")
print("\nUS/EU Top 5 Highest Paid Positions:")
print(us_eu_top_paid_positions)

india_top_paid_positions = top_pivot(india_df, "job_title")
print("\nIndia Top 5 Highest Paid Positions:")
print(india_top_paid_positions)

ar_top_paid_positions = top_pivot(ar_df, "job_title")
print("\nArgentina Top 5 Highest Paid Positions:")
print(ar_top_paid_positions)

# Top 5 highest paid regions in each country
us_eu_top_regions = top_pivot(us_eu_df, "location_name")
print("\nUS/EU Top 5 Highest Paid Locations:")
print(us_eu_top_regions)

india_top_regions = top_pivot(india_df, "location_name")
print("\nIndia Top 5 Highest Paid Locations:")
print(india_top_regions)

ar_top_regions = top_pivot(ar_df, "location_name")
print("\nArgentina Top 5 Highest Paid Locations:")
print(ar_top_regions)

# Top 5 highest paying companies in each country
# us_eu_top_companies = top_pivot(us_eu_df, "employer_name")
# print("\nUS/EU Top 5 Highest Paying Companies:")
# print(us_eu_top_companies)
# india_top_companies = top_pivot(india_df, "employer_name")
# print("\nIndia Top 5 Highest Paying Companies:")
# print(india_top_companies)

# 4. What is the difference between pay for different kinds of developers in each country?
def plot_average_pay(df, country_name):
    # Get the ten most common job titles
    most_common_job_titles = df['job_title'].value_counts().head(10).index

    # Filter the DataFrame to include only those job titles
    filtered_df = df[df['job_title'].isin(most_common_job_titles)]

    # Group by job title and compute the average pay
    average_pay_by_job_title = filtered_df.groupby('job_title')['annual_base_pay'].mean().reset_index()

    # Sort by average pay in descending order
    average_pay_by_job_title = average_pay_by_job_title.sort_values(by='annual_base_pay', ascending=False)

    # Plot the bar graph
    plt.figure(figsize=(12, 6))
    plt.bar(average_pay_by_job_title['job_title'], average_pay_by_job_title['annual_base_pay'], color='skyblue')
    plt.xlabel('Job Title')
    plt.ylabel('Average Annual Base Pay')
    plt.title(f'Average Pay for Top 10 Most Common Job Titles in {country_name}')
    plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for readability
    plt.tight_layout()
    plt.show()

plot_average_pay(ar_df, 'Argentina')
plot_average_pay(us_eu_df, 'US/EU')
plot_average_pay(india_df, 'India')
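# One caveat with the rankings above: a job title or location with a single
# very high salary report can top the list. A possible refinement (sketched
# here, not applied to the analysis; min_count is my own hypothetical
# parameter) is to require a minimum number of entries before ranking:
# def top_pivot_min_count(df, column_name, min_count=5):
#     agg = df.groupby(column_name)['annual_base_pay'].agg(['mean', 'count']).reset_index()
#     agg = agg[agg['count'] >= min_count]  # keep only well-supported groups
#     return agg.sort_values(by='mean', ascending=False).head(5).set_index(column_name)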
# 5. What is the difference between pay for men and women developers in Argentina?

# Job titles to compare
selected_job_titles = ['Developer', 'UX Designer', 'Data Engineer', 'Project Manager', 'Consultant', 'QA / Tester']

# Filter the DataFrame for the specified job titles and valid gender identities
filtered_df = ar_df[
    (ar_df['job_title'].isin(selected_job_titles)) &       # Job title filter
    (ar_df['gender_identity'].isin(['Male', 'Female']))    # Gender filter
]

# Create a pivot table comparing compensation for job titles by gender
pivot_table = pd.pivot_table(filtered_df, values='annual_base_pay',
                             index='job_title', columns='gender_identity', aggfunc='mean')

# Plot the bar graph
pivot_table.plot(kind='bar', figsize=(10, 6))
plt.xlabel('Job Title')
plt.ylabel('Average Annual Base Pay')
plt.title('Comparison of Compensation by Job Title in Argentina (Male vs. Female)')
plt.legend(title='Gender', loc='upper right')
plt.tight_layout()  # Leave room for the rotated labels
plt.show()

# Add a column showing the difference between men's and women's salaries
pivot_table['Salary_Difference'] = pivot_table['Male'] - pivot_table['Female']
print("\n")

# Display the pivot table
print(pivot_table)
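# As a possible extension (my addition, not part of the original analysis):
# an absolute dollar gap is hard to compare across differently paid titles,
# so a relative gap normalizes it. 'Pct_Gap' is a hypothetical column name.
pivot_table['Pct_Gap'] = (pivot_table['Salary_Difference'] / pivot_table['Female']) * 100
print(pivot_table[['Salary_Difference', 'Pct_Gap']])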