import pandas as pd
import numpy as np
import pycountry
import pycountry_convert as pc
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import f_oneway
import statsmodels.api as sm

# Read in the data
layoff_df = pd.read_csv('global_layoffs.csv')
layoff_df

# Select the columns we want to keep
layoff_refined = layoff_df[['country', 'date',
                            'total_laid_off', 'percentage_laid_off']]
layoff_refined

# Find missing values
layoff_refined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3313 entries, 0 to 3312
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   country              3313 non-null   object 
 1   date                 3313 non-null   object 
 2   total_laid_off       2189 non-null   float64
 3   percentage_laid_off  2141 non-null   float64
dtypes: float64(2), object(2)
memory usage: 103.7+ KB

# Drop missing values
layoff_refined = layoff_refined.dropna()

# Count the data for every country
layoff_refined["country"].value_counts()

country
United States           997
India                   107
Canada                   85
Brazil                   56
Israel                   52
United Kingdom           51
Germany                  49
Australia                29
Singapore                23
Sweden                   14
Indonesia                10
Netherlands               8
Kenya                     6
Nigeria                   6
France                    6
Ireland                   5
New Zealand               4
United Arab Emirates      4
Argentina                 4
Estonia                   4
Norway                    3
Hong Kong                 3
China                     3
Denmark                   2
Spain                     2
Austria                   2
Chile                     2
South Korea               2
Mexico                    2
Malaysia                  2
Portugal                  1
Switzerland               1
Bulgaria                  1
Seychelles                1
Russia                    1
Poland                    1
Finland                   1
Senegal                   1
Thailand                  1
Luxembourg                1
Saudi Arabia              1
Myanmar                   1
Name: count, dtype: int64

# Make a copy of the data frame
layoff_refined = layoff_refined.copy()

# Rename the columns to be more descriptive and consistent
layoff_refined.rename(columns={'country': 'Country', 'date': 'Date',
                      'total_laid_off': 'Total Layoffs', 'percentage_laid_off': 'Layoff Percentage'}, inplace=True)
layoff_refined

# Convert 'date' column to datetime if it's not already in datetime format
layoff_refined['Date'] = pd.to_datetime(layoff_refined['Date'])

# Extract the year from the 'date' column
layoff_refined['Year'] = layoff_refined['Date'].dt.year

# Filter the DataFrame to include only rows with years 2020, 2021, 2022, and 2023
layoff_filtered = layoff_refined[layoff_refined['Year'].isin(
    [2020, 2021, 2022, 2023])]
layoff_filtered

# Make a copy of the data frame
layoff_filtered = layoff_filtered.copy()

# Drop the Date column
layoff_filtered.drop(columns='Date', axis=1, inplace=True)
layoff_filtered

# Reset the index
layoff_filtered.reset_index(drop=True, inplace=True)
layoff_filtered

# Group the data by 'Country' and 'Year' and sum the 'Total Layoffs' for each group
layoff_counts = layoff_filtered.groupby(['Country', 'Year'])[
    'Total Layoffs'].sum()

# Convert the groupby object to a DataFrame
layoff_counts_df = pd.DataFrame(layoff_counts).reset_index()
layoff_counts_df

# Checking to see if there are any null values
layoff_counts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84 entries, 0 to 83
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Country        84 non-null     object 
 1   Year           84 non-null     int32  
 2   Total Layoffs  84 non-null     float64
dtypes: float64(1), int32(1), object(1)
memory usage: 1.8+ KB

# Pivot the data frame
layoff_final_df = layoff_counts_df.pivot(
    index='Country', columns='Year', values='Total Layoffs').dropna()
layoff_final_df

# Convert the data frame to a dictionary with Year and Country as keys
layoff_dict = layoff_final_df.to_dict(orient="index")

# Convert the dictionary values to lists
for key, value in layoff_dict.items():
    value["Year"] = list(value.keys())
    value["Layoffs"] = list(value.values())
    del value["Year"]
    del value["Layoffs"]
layoff_dict

# Pivot the data frame
layoff_final_df = layoff_counts_df.pivot(
    index="Country", columns="Year", values="Total Layoffs").dropna()

# Convert the data frame to a dictionary with Year and Country as keys
layoff_dict = layoff_final_df.to_dict(orient="index")

# Convert the dictionary values to lists
layoff_final_df = {"Year": list(layoff_dict.values())[0].keys()}
for key, value in layoff_dict.items():
    layoff_final_df[key] = list(value.values())
layoff_final_df = pd.DataFrame(layoff_final_df)

# Set the index to ‘Year’
layoff_final_df.set_index("Year", inplace=True)

# Create a figure and axis object
fig, ax = plt.subplots()

# Plot the histograms
for country in layoff_final_df.columns:
    ax.bar(layoff_final_df.index,
           layoff_final_df[country], label=country, alpha=0.5)

# Plot the line graphs
for country in layoff_final_df.columns:
    ax.plot(layoff_final_df.index, layoff_final_df[country], label=country)
ax.set_xlabel("Year")
ax.set_ylabel("Number of Layoffs")
ax.set_title("Number of Layoffs by Country")
ax.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.xticks(rotation=45)
plt.show()

# Read in the data
income_df = pd.read_csv('income_groups.csv')
income_df

# Pip install the pycountry library
!pip install pycountry

def country_code_to_name(country_code):
    '''Converts a country code to a country name'''
    # Use the pycountry library to convert the country code to a country name
    try:
        country = pycountry.countries.get(
            alpha_3=country_code)  # Get the country name
        return country.name
    except AttributeError:
        return "Unknown"

# Apply the function to the 'country_code' column
income_df['Country'] = income_df['country_code'].apply(country_code_to_name)
income_df

# Drop missing values
income_df.dropna(inplace=True)
income_df

# Find the "Unknown" values to replace with the correct country names
unknown_vales = income_df[income_df['Country'] == 'Unknown']
unknown_vales

# Replace "Unknown" values in the "Country" column based on different conditions
income_df.loc[(income_df['Country'] == 'Unknown') & (
    income_df['income_group'] == 'Upper middle income'), 'Country'] = 'Kosovo'
income_df.loc[(income_df['Country'] == 'Unknown') & (
    income_df['income_group'] == 'High income'), 'Country'] = 'Czech Republic,'
income_df

# Drop the "country_code" and "region" columns
income_df.drop(columns=['country_code', 'region'], inplace=True)
income_df

# Make a copy of the data frame
income_final_df = income_df.copy()

# Rename the columns to be more descriptive and consistent
income_final_df.rename(
    columns={'Country': 'Country', 'income_group': 'Income Group'}, inplace=True)

# Reindex the columns
income_final_df = income_final_df.reindex(columns=['Country', 'Income Group'])
income_final_df

# Visualize the income group data in a digestible format
income_final_df['Income Group'].value_counts().plot(kind='bar')
plt.title('Number of Countries in Each Income Group')
plt.xlabel('Income Group')
plt.ylabel('Number of Countries')
plt.xticks(rotation=45)
plt.show()

# Read in the data
gdpgrowth_df = pd.read_csv('gdp_growth_rate.csv')
gdpgrowth_df

# Find missing values
gdpgrowth_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231 entries, 0 to 230
Data columns (total 50 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Real GDP growth (Annual percent change)  229 non-null    object
 1   1980                                     228 non-null    object
 2   1981                                     228 non-null    object
 3   1982                                     228 non-null    object
 4   1983                                     228 non-null    object
 5   1984                                     228 non-null    object
 6   1985                                     228 non-null    object
 7   1986                                     228 non-null    object
 8   1987                                     228 non-null    object
 9   1988                                     228 non-null    object
 10  1989                                     228 non-null    object
 11  1990                                     228 non-null    object
 12  1991                                     228 non-null    object
 13  1992                                     228 non-null    object
 14  1993                                     228 non-null    object
 15  1994                                     228 non-null    object
 16  1995                                     228 non-null    object
 17  1996                                     228 non-null    object
 18  1997                                     228 non-null    object
 19  1998                                     228 non-null    object
 20  1999                                     228 non-null    object
 21  2000                                     228 non-null    object
 22  2001                                     228 non-null    object
 23  2002                                     228 non-null    object
 24  2003                                     228 non-null    object
 25  2004                                     228 non-null    object
 26  2005                                     228 non-null    object
 27  2006                                     228 non-null    object
 28  2007                                     228 non-null    object
 29  2008                                     228 non-null    object
 30  2009                                     228 non-null    object
 31  2010                                     228 non-null    object
 32  2011                                     228 non-null    object
 33  2012                                     228 non-null    object
 34  2013                                     228 non-null    object
 35  2014                                     228 non-null    object
 36  2015                                     228 non-null    object
 37  2016                                     228 non-null    object
 38  2017                                     228 non-null    object
 39  2018                                     228 non-null    object
 40  2019                                     228 non-null    object
 41  2020                                     228 non-null    object
 42  2021                                     228 non-null    object
 43  2022                                     228 non-null    object
 44  2023                                     228 non-null    object
 45  2024                                     228 non-null    object
 46  2025                                     228 non-null    object
 47  2026                                     228 non-null    object
 48  2027                                     228 non-null    object
 49  2028                                     228 non-null    object
dtypes: object(50)
memory usage: 90.4+ KB

# Select the columns we want to keep
gdpgrowth_recent = gdpgrowth_df[[
    "Real GDP growth (Annual percent change)", "2020", "2021", "2022", "2023"]]

# Drop missing values
gdpgrowth_recent = gdpgrowth_recent.dropna()
gdpgrowth_recent

# Replace "no data" with pd.NA
gdpgrowth_recent.replace("no data", pd.NA, inplace=True)
gdpgrowth_recent

# Make a copy of the data frame
gdpgrowth_country_final_df = gdpgrowth_recent.copy()

# Rename the columns to be more descriptive and consistent
gdpgrowth_country_final_df.rename(columns={"Real GDP growth (Annual percent change)": "Country", "2020": "Annual GDP Growth(2020)", "2021": "Annual GDP Growth(2021)",
                                           "2022": "Annual GDP Growth(2022)", "2023": "Annual GDP Growth(2023)"}, inplace=True)

# Reset the index
gdpgrowth_country_final_df.reset_index(drop=True, inplace=True)

# Select the rows from 1 to 195
gdpgrowth_country_final_df = gdpgrowth_country_final_df.iloc[1:195].copy()
gdpgrowth_country_final_df

# Convert the columns to numeric data types
gdpgrowth_country_final_df['Annual GDP Growth(2020)'] = pd.to_numeric(
    gdpgrowth_country_final_df['Annual GDP Growth(2020)'])
gdpgrowth_country_final_df['Annual GDP Growth(2021)'] = pd.to_numeric(
    gdpgrowth_country_final_df['Annual GDP Growth(2021)'])
gdpgrowth_country_final_df['Annual GDP Growth(2022)'] = pd.to_numeric(
    gdpgrowth_country_final_df['Annual GDP Growth(2022)'])
gdpgrowth_country_final_df['Annual GDP Growth(2023)'] = pd.to_numeric(
    gdpgrowth_country_final_df['Annual GDP Growth(2023)'])

# Sort the dataframe by each year's GDP growth rate in descending order
gdpgrowth_sorted = gdpgrowth_country_final_df.sort_values(
    by=['Annual GDP Growth(2020)', 'Annual GDP Growth(2021)', 'Annual GDP Growth(2022)', 'Annual GDP Growth(2023)'], ascending=False)

# Get the top ten countries with the highest GDP growth rate for each year
top_ten_2020 = gdpgrowth_sorted.head(10)['Country']
top_ten_2021 = gdpgrowth_sorted.head(10)['Country']
top_ten_2022 = gdpgrowth_sorted.head(10)['Country']
top_ten_2023 = gdpgrowth_sorted.head(10)['Country']

# Visualize the top ten countries with the highest GDP growth rate for each year
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 8))

# Plot for 2020
axes[0, 0].bar(top_ten_2020, gdpgrowth_sorted.head(10)
               ['Annual GDP Growth(2020)'])
axes[0, 0].set_xlabel('Country')
axes[0, 0].set_ylabel('GDP Growth Rate')
axes[0, 0].set_title('Top Ten Countries with Highest GDP Growth Rate in 2020')
axes[0, 0].tick_params(axis='x', rotation=45)

# Plot for 2021
axes[0, 1].bar(top_ten_2021, gdpgrowth_sorted.head(10)
               ['Annual GDP Growth(2021)'])
axes[0, 1].set_xlabel('Country')
axes[0, 1].set_ylabel('GDP Growth Rate')
axes[0, 1].set_title('Top Ten Countries with Highest GDP Growth Rate in 2021')
axes[0, 1].tick_params(axis='x', rotation=45)

# Plot for 2022
axes[1, 0].bar(top_ten_2022, gdpgrowth_sorted.head(10)
               ['Annual GDP Growth(2022)'])
axes[1, 0].set_xlabel('Country')
axes[1, 0].set_ylabel('GDP Growth Rate')
axes[1, 0].set_title('Top Ten Countries with Highest GDP Growth Rate in 2022')
axes[1, 0].tick_params(axis='x', rotation=45)

# Plot for 2023
axes[1, 1].bar(top_ten_2023, gdpgrowth_sorted.head(10)
               ['Annual GDP Growth(2023)'])
axes[1, 1].set_xlabel('Country')
axes[1, 1].set_ylabel('GDP Growth Rate')
axes[1, 1].set_title('Top Ten Countries with Highest GDP Growth Rate in 2023')
axes[1, 1].tick_params(axis='x', rotation=45)

# Adjust the layout
plt.tight_layout()
plt.show()

# Read in the data
happiness2020_df = pd.read_csv('world_happiness_report_2020.csv')
happiness2020_df

# Select the columns we want to keep
happiness2020_refined_df = happiness2020_df[["Country name", "Regional indicator", "Ladder score", "Logged GDP per capita", "Social support",
                                             "Healthy life expectancy", "Freedom to make life choices", "Perceptions of corruption"]]
happiness2020_refined_df

# Make a copy of the data frame
happiness2020_final_df = happiness2020_refined_df.copy()

# Drop missing values
happiness2020_final_df.dropna(inplace=True)

# Rename the columns to be more descriptive and consistent
happiness2020_final_df.rename(columns={"Country name": "Country", "Regional indicator": "Region", "Ladder score": "Happiness Score(2020)", "Logged GDP per capita": "GDP per Capita(2020)", "Social support": "Social Support(2020)",
                                       "Healthy life expectancy": "Life Expectancy(2020)", "Freedom to make life choices": "Freedom(2020)", "Perceptions of corruption": "Corruption(2020)"}, inplace=True)
happiness2020_final_df

# Visualize the happiness score 2020 data against all the other columns
happiness2020_final_df.hist(figsize=(15, 15))
plt.suptitle("Happiness Score 2020 Data Distribution")
plt.show()

# Read in the data
happiness2021_df = pd.read_csv('world_happiness_report_2021.csv')
happiness2021_df

# Select the columns we want to keep
happiness2021_refined_df = happiness2021_df[["Country name", "Regional indicator", "Ladder score", "Logged GDP per capita", "Social support",
                                             "Healthy life expectancy", "Freedom to make life choices", "Perceptions of corruption"]]
happiness2021_refined_df

# Make a copy of the data frame
happiness2021_final_df = happiness2021_refined_df.copy()

# Drop missing values
happiness2021_final_df.dropna(inplace=True)

# Rename the columns to be more descriptive and consistent
happiness2021_final_df.rename(columns={"Country name": "Country", "Regional indicator": "Region", "Ladder score": "Happiness Score(2021)", "Logged GDP per capita": "GDP per Capita(2021)", "Social support": "Social Support(2021)",
                                       "Healthy life expectancy": "Life Expectancy(2021)", "Freedom to make life choices": "Freedom(2021)", "Perceptions of corruption": "Corruption(2021)"}, inplace=True)
happiness2021_final_df

# Create a new data frame with the 'Country' and 'Region' columns
region_df = happiness2021_final_df[['Country', 'Region']].copy()
region_df

# Visualize the happiness score 2021 data against all the other columns
happiness2021_final_df.hist(figsize=(15, 15))
plt.suptitle("Happiness Score 2021 Data Distribution")
plt.show()

# Read in the data
happiness2022_df = pd.read_csv('world_happiness_report_2022.csv')
happiness2022_df

# Select the columns we want to keep
happiness2022_df = happiness2022_df[["Country", "Region", "Happiness Score", "Economy (GDP per Capita)", "Family (Social Support)",
                                     "Health (Life Expectancy)", "Freedom", "Trust (Government Corruption)"]]
happiness2022_df

# Make a copy of the data frame
happiness2022_final_df = happiness2022_df.copy()

# Drop missing values
happiness2022_final_df.dropna(inplace=True)

# Rename the columns to be more descriptive and consistent
happiness2022_final_df.rename(columns={"Happiness Score": "Happiness Score(2022)", "Economy (GDP per Capita)": "GDP per Capita(2022)", "Family (Social Support)": "Social Support(2022)",
                              "Health (Life Expectancy)": "Life Expectancy(2022)", "Freedom": "Freedom(2022)", "Trust (Government Corruption)": "Corruption(2022)"}, inplace=True)
happiness2022_final_df

# Convert the 'GDP per Capita(2022)' column to an integer
happiness2022_final_df["GDP per Capita(2022)"] = happiness2022_final_df["GDP per Capita(2022)"].str.replace(
    ",", "").astype(int)

# Log transform the 'GDP per Capita(2022)' column
happiness2022_final_df["Logged GDP per Capita(2022)"] = np.log2(
    happiness2022_final_df["GDP per Capita(2022)"])

# Drop the 'GDP per Capita(2022)' column
happiness2022_final_df.drop(columns=["GDP per Capita(2022)"], inplace=True)

# Rename the 'Logged GDP per Capita(2022)' column to 'GDP per Capita(2022)'
happiness2022_final_df.rename(
    columns={"Logged GDP per Capita(2022)": "GDP per Capita(2022)"}, inplace=True)
happiness2022_final_df

# Replace the commas with periods in every applicable column
happiness2022_final_df = happiness2022_final_df.replace(",", ".", regex=True)

# Convert the columns to float
happiness2022_final_df["Life Expectancy(2022)"] = happiness2022_final_df["Life Expectancy(2022)"].astype(
    float)

# Multiply the 'Life Expectancy(2022)' column by 100
happiness2022_final_df["Life Expectancy(2022)"] = happiness2022_final_df["Life Expectancy(2022)"] * 100
happiness2022_final_df

# Convert the 'Happiness Score(2022)', 'Freedom(2022)' and 'Corruption(2022)' columns to float
happiness2022_final_df["Happiness Score(2022)"] = happiness2022_final_df["Happiness Score(2022)"].astype(
    float)
happiness2022_final_df["Freedom(2022)"] = happiness2022_final_df["Freedom(2022)"].astype(
    float)
happiness2022_final_df["Corruption(2022)"] = happiness2022_final_df["Corruption(2022)"].astype(
    float)

# Visualize the Life Expectancy data
happiness2022_final_df["Life Expectancy(2022)"].plot(
    kind="hist", figsize=(8, 6), range=(0, 100))
plt.xlabel("Life Expectancy")
plt.ylabel("Frequency")
plt.title("Distribution of Life Expectancy in 2022")
plt.show()

# Visualize the Happiness Score
happiness2022_final_df["Happiness Score(2022)"].plot(
    kind="hist", figsize=(8, 6))
plt.xlabel("Happiness Score")
plt.ylabel("Frequency")
plt.title("Distribution of Happiness Score in 2022")
plt.show()

# Visualize the GDP per Capita
happiness2022_final_df["GDP per Capita(2022)"].plot(
    kind="hist", figsize=(8, 6), range=(8, 12))
plt.xlabel("GDP per Capita")
plt.ylabel("Frequency")
plt.title("Distribution of GDP per Capita in 2022")
plt.show()

# Visualize the Freedom data
happiness2022_final_df["Freedom(2022)"].plot(
    kind="hist", figsize=(8, 6), range=(0, 1))
plt.xlabel("Freedom")
plt.ylabel("Frequency")
plt.title("Distribution of Freedom in 2022")
plt.show()

# Visualize the Corruption data
happiness2022_final_df["Corruption(2022)"].plot(
    kind="hist", figsize=(8, 6), range=(0, 0.7))
plt.xlabel("Corruption")
plt.ylabel("Frequency")
plt.title("Distribution of Corruption in 2022")
plt.show()

# Read in the data
happiness2023_df = pd.read_csv('world_happiness_report_2023.csv')
happiness2023_df

# Pip install the pycountry-convert library
!pip install pycountry-convert

# Select the columns we want to keep
happiness2023_refined_df = happiness2023_df[["Country name", "Ladder score", "Logged GDP per capita", "Social support",
                                             "Healthy life expectancy", "Freedom to make life choices", "Perceptions of corruption"]]
happiness2023_refined_df

# Make a copy of the data frame
happiness2023_final_df = happiness2023_refined_df.copy()

# Drop missing values
happiness2023_final_df.dropna(inplace=True)

# Rename the columns to be more descriptive and consistent
happiness2023_final_df.rename(columns={"Country name": "Country", "Ladder score": "Happiness Score(2023)", "Logged GDP per capita": "GDP per Capita(2023)",
                                       "Social support": "Social Support(2023)", "Healthy life expectancy": "Life Expectancy(2023)", "Freedom to make life choices": "Freedom(2023)",
                                       "Perceptions of corruption": "Corruption(2023)"}, inplace=True)

# Merge the 'Region' column from the 'region_df' data frame with the 'happiness2023_final_df' data frame
happiness2023_final_df = happiness2023_final_df.merge(
    region_df, on='Country', how='left')
happiness2023_final_df

# Match the countries to their respective regions
region_mapping = {
    "Czechia": "Central and Eastern Europe",
    "State of Palestine": "Middle East and Northern Africa",
    "Turkiye": "Middle East and Northern Africa",
    "Congo (Kinshasa)": "Sub-Saharan Africa"
}

# Supply the region_mapping dictionary to the 'Region' column
happiness2023_final_df["Region"] = happiness2023_final_df["Region"].replace(
    region_mapping)

# Fill the missing values in the 'Region' column with the values from the 'Country' column
happiness2023_final_df["Region"] = happiness2023_final_df["Region"].fillna(
    happiness2023_final_df["Country"].map(region_mapping))
happiness2023_final_df

# Visualize the happiness score 2023 data against all the other columns
happiness2023_final_df.hist(figsize=(15, 15))
plt.suptitle("Happiness Score 2023 Data Distribution")
plt.show()

# Merge the three World Happiness Reports into one data frame
merged_happiness_df_1 = happiness2020_final_df.merge(
    happiness2021_final_df, on=["Country", "Region"], how="inner")
merged_happiness_df_2 = merged_happiness_df_1.merge(
    happiness2022_final_df, on=["Country", "Region"], how="inner")
merged_happiness_final_df = merged_happiness_df_2.merge(
    happiness2023_final_df, on=["Country", "Region"], how="inner")
merged_happiness_final_df

# Merge the happiness and income data frames
merged_happiness_income_df = merged_happiness_final_df.merge(
    income_final_df, on="Country", how="inner")
merged_happiness_income_df

# Merge the merged_happiness_income_df and gdpgrowth_country_final_df data frames into one data frame
merged_happiness_income_gdp_df = merged_happiness_income_df.merge(
    gdpgrowth_country_final_df, on=["Country"], how="inner")
merged_happiness_income_gdp_df

# Reindex the columns
merged_happiness_income_gdp_df = merged_happiness_income_gdp_df.reindex(columns=["Country", "Region", "Income Group",
                                                                                 "Happiness Score(2020)", "Happiness Score(2021)", "Happiness Score(2022)", "Happiness Score(2023)",
                                                                                 "GDP per Capita(2020)", "Annual GDP Growth(2020)",
                                                                                 "GDP per Capita(2021)", "Annual GDP Growth(2021)",
                                                                                 "GDP per Capita(2022)", "Annual GDP Growth(2022)",
                                                                                 "GDP per Capita(2023)", "Annual GDP Growth(2023)",
                                                                                 "Social Support(2020)", "Social Support(2021)", "Social Support(2022)", "Social Support(2023)",
                                                                                 "Life Expectancy(2020)", "Life Expectancy(2021)", "Life Expectancy(2022)", "Life Expectancy(2023)",
                                                                                 "Freedom(2020)", "Freedom(2021)", "Freedom(2022)", "Freedom(2023)",
                                                                                 "Corruption(2020)", "Corruption(2021)", "Corruption(2022)", "Corruption(2023)"])

# Convert the 'Happiness Score(2022)' column to float
merged_happiness_income_gdp_df["Happiness Score(2022)"] = merged_happiness_income_gdp_df["Happiness Score(2022)"].astype(
    float)
merged_happiness_income_gdp_df

# Visualize the merged data into four subplots showing the relationship between happiness score and GDP per capita for each year
fig, ax = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Happiness Score vs GDP per Capita (2020-2023)', fontsize=16)

# 2020
ax[0, 0].scatter(merged_happiness_income_gdp_df["GDP per Capita(2020)"],
                 merged_happiness_income_gdp_df["Happiness Score(2020)"], color='blue')
ax[0, 0].set_title('2020')
ax[0, 0].set_xlabel('GDP per Capita')
ax[0, 0].set_ylabel('Happiness Score')

# 2021
ax[0, 1].scatter(merged_happiness_income_gdp_df["GDP per Capita(2021)"],
                 merged_happiness_income_gdp_df["Happiness Score(2021)"], color='green')
ax[0, 1].set_title('2021')
ax[0, 1].set_xlabel('GDP per Capita')
ax[0, 1].set_ylabel('Happiness Score')

# 2022
ax[1, 0].scatter(merged_happiness_income_gdp_df["GDP per Capita(2022)"],
                 merged_happiness_income_gdp_df["Happiness Score(2022)"], color='red')
ax[1, 0].set_title('2022')
ax[1, 0].set_xlabel('GDP per Capita')
ax[1, 0].set_ylabel('Happiness Score')

# 2023
ax[1, 1].scatter(merged_happiness_income_gdp_df["GDP per Capita(2023)"],
                 merged_happiness_income_gdp_df["Happiness Score(2023)"], color='purple')
ax[1, 1].set_title('2023')
ax[1, 1].set_xlabel('GDP per Capita')
ax[1, 1].set_ylabel('Happiness Score')

plt.tight_layout()
plt.show()

# Print the final layoff data frame
layoff_final_df

# Compute descriptive statistics for the layoff data frame
layoff_final_df.describe()

# Function to calculate the mode
def calculate_mode(data):
    unique, counts = np.unique(data, return_counts=True)
    max_count_index = np.argmax(counts)
    return unique[max_count_index]


# Calculate descriptive statistics for each country's layoffs
stats_data = []

for country in layoff_final_df.columns:
    layoffs = layoff_final_df[country]
    stats_data.append({
        'Country': country,
        'Mean': np.mean(layoffs),
        'Median': np.median(layoffs),
        'Mode': calculate_mode(layoffs),
        'Range': np.max(layoffs) - np.min(layoffs),
        'Variance': np.var(layoffs),
        'Standard Deviation': np.std(layoffs)
    })

# Create a DataFrame from the collected statistics
stats_df = pd.DataFrame(stats_data)

# Display the DataFrame
stats_df

# Print the merged data frame
merged_happiness_income_gdp_df

# Compute descriptive statistics for the merged data frame
merged_happiness_income_gdp_df.describe()

# Extracting the years from the column names
years = [col.split('(')[-1].split(')')[0]
         for col in merged_happiness_income_gdp_df.columns if 'Happiness Score' in col]

# Creating a DataFrame to store descriptive statistics for each year
time_comparison = pd.DataFrame(index=years)

# Calculating descriptive statistics for each year
for year in years:
    happiness_scores = merged_happiness_income_gdp_df[f'Happiness Score({year})']
    time_comparison.loc[year, 'Mean'] = happiness_scores.mean()
    time_comparison.loc[year, 'Median'] = happiness_scores.median()
    # Mode can have multiple values, we choose the first one
    time_comparison.loc[year, 'Mode'] = happiness_scores.mode().iloc[0]
    time_comparison.loc[year, 'Range'] = happiness_scores.max() - \
        happiness_scores.min()
    time_comparison.loc[year, 'Variance'] = happiness_scores.var()
    time_comparison.loc[year, 'Standard Deviation'] = happiness_scores.std()

# Print or visualize the results
time_comparison

# Grouping the data by region
region_stats = merged_happiness_income_gdp_df.groupby('Region').agg({
    'Happiness Score(2020)': ['mean', 'median', lambda x: x.mode().iloc[0], lambda x: x.max() - x.min(), 'var', 'std'],
    'GDP per Capita(2020)': ['mean', 'median', lambda x: x.mode().iloc[0], lambda x: x.max() - x.min(), 'var', 'std'],
    'Life Expectancy(2020)': ['mean', 'median', lambda x: x.mode().iloc[0], lambda x: x.max() - x.min(), 'var', 'std'],
    'Freedom(2020)': ['mean', 'median', lambda x: x.mode().iloc[0], lambda x: x.max() - x.min(), 'var', 'std'],
    'Corruption(2020)': ['mean', 'median', lambda x: x.mode().iloc[0], lambda x: x.max() - x.min(), 'var', 'std']
})

# Renaming columns for better readability
region_stats.columns = ['Mean Happiness Score', 'Median Happiness Score', 'Mode Happiness Score', 'Range Happiness Score', 'Variance Happiness Score', 'Standard Deviation Happiness Score',
                        'Mean GDP per Capita', 'Median GDP per Capita', 'Mode GDP per Capita', 'Range GDP per Capita', 'Variance GDP per Capita', 'Standard Deviation GDP per Capita',
                        'Mean Life Expectancy', 'Median Life Expectancy', 'Mode Life Expectancy', 'Range Life Expectancy', 'Variance Life Expectancy', 'Standard Deviation Life Expectancy',
                        'Mean Freedom', 'Median Freedom', 'Mode Freedom', 'Range Freedom', 'Variance Freedom', 'Standard Deviation Freedom',
                        'Mean Corruption', 'Median Corruption', 'Mode Corruption', 'Range Corruption', 'Variance Corruption', 'Standard Deviation Corruption']

# Displaying the results
region_stats

# Creating a DataFrame to store descriptive statistics for GDP per Capita
gdp_stats = pd.DataFrame(index=years)

# Calculating descriptive statistics for GDP per Capita for each year
for year in years:
    gdp_values = merged_happiness_income_gdp_df[f'GDP per Capita({year})']
    gdp_stats.loc[year, 'Mean'] = gdp_values.mean()
    gdp_stats.loc[year, 'Median'] = gdp_values.median()
    # Mode can have multiple values, we choose the first one
    gdp_stats.loc[year, 'Mode'] = gdp_values.mode().iloc[0]
    gdp_stats.loc[year, 'Range'] = gdp_values.max() - gdp_values.min()
    gdp_stats.loc[year, 'Variance'] = gdp_values.var()
    gdp_stats.loc[year, 'Standard Deviation'] = gdp_values.std()

# Print or visualize the results
gdp_stats

# Creating a DataFrame to store descriptive statistics for Annual GDP Growth
gdp_growth_stats = pd.DataFrame(index=years)

# Calculating descriptive statistics for Annual GDP Growth for each year
for year in years:
    gdp_growth_values = merged_happiness_income_gdp_df[f'Annual GDP Growth({year})']
    gdp_growth_stats.loc[year, 'Mean'] = gdp_growth_values.mean()
    gdp_growth_stats.loc[year, 'Median'] = gdp_growth_values.median()
    # Mode can have multiple values, we choose the first one
    gdp_growth_stats.loc[year, 'Mode'] = gdp_growth_values.mode().iloc[0]
    gdp_growth_stats.loc[year, 'Range'] = gdp_growth_values.max(
    ) - gdp_growth_values.min()
    gdp_growth_stats.loc[year, 'Variance'] = gdp_growth_values.var()
    gdp_growth_stats.loc[year, 'Standard Deviation'] = gdp_growth_values.std()

# Print or visualize the results
gdp_growth_stats

# Creating a DataFrame to store descriptive statistics for Social Support
social_support_stats = pd.DataFrame(index=years)

# Calculating descriptive statistics for Social Support for each year
for year in years:
    social_support_values = pd.to_numeric(
        merged_happiness_income_gdp_df[f'Social Support({year})'], errors='coerce')
    social_support_stats.loc[year, 'Mean'] = social_support_values.mean()
    social_support_stats.loc[year, 'Median'] = social_support_values.median()
    # Mode can have multiple values, we choose the first one
    social_support_stats.loc[year,
                             'Mode'] = social_support_values.mode().iloc[0]
    social_support_stats.loc[year, 'Range'] = social_support_values.max(
    ) - social_support_values.min()
    social_support_stats.loc[year, 'Variance'] = social_support_values.var()
    social_support_stats.loc[year,
                             'Standard Deviation'] = social_support_values.std()

# Print or visualize the results
social_support_stats

# Creating a DataFrame to store descriptive statistics for Life Expectancy
life_expectancy_stats = pd.DataFrame(index=years)

# Calculating descriptive statistics for Life Expectancy for each year
for year in years:
    life_expectancy_values = merged_happiness_income_gdp_df[
        f'Life Expectancy({year})']
    life_expectancy_stats.loc[year, 'Mean'] = life_expectancy_values.mean()
    life_expectancy_stats.loc[year, 'Median'] = life_expectancy_values.median()
    # Mode can have multiple values, we choose the first one
    life_expectancy_stats.loc[year,
                              'Mode'] = life_expectancy_values.mode().iloc[0]
    life_expectancy_stats.loc[year, 'Range'] = life_expectancy_values.max(
    ) - life_expectancy_values.min()
    life_expectancy_stats.loc[year, 'Variance'] = life_expectancy_values.var()
    life_expectancy_stats.loc[year,
                              'Standard Deviation'] = life_expectancy_values.std()

# Print or visualize the results
life_expectancy_stats

# Creating a DataFrame to store descriptive statistics for Freedom
freedom_stats = pd.DataFrame(index=years)

# Calculating descriptive statistics for Freedom for each year
for year in years:
    freedom_values = merged_happiness_income_gdp_df[f'Freedom({year})']
    freedom_stats.loc[year, 'Mean'] = freedom_values.mean()
    freedom_stats.loc[year, 'Median'] = freedom_values.median()
    # Mode can have multiple values, we choose the first one
    freedom_stats.loc[year, 'Mode'] = freedom_values.mode().iloc[0]
    freedom_stats.loc[year, 'Range'] = freedom_values.max() - \
        freedom_values.min()
    freedom_stats.loc[year, 'Variance'] = freedom_values.var()
    freedom_stats.loc[year, 'Standard Deviation'] = freedom_values.std()

# Print or visualize the results
freedom_stats

# Creating a DataFrame to store descriptive statistics for Corruption
corruption_stats = pd.DataFrame(index=years)

# Calculating descriptive statistics for Corruption for each year
for year in years:
    corruption_values = merged_happiness_income_gdp_df[f'Corruption({year})']
    corruption_stats.loc[year, 'Mean'] = corruption_values.mean()
    corruption_stats.loc[year, 'Median'] = corruption_values.median()
    # Mode can have multiple values, we choose the first one
    corruption_stats.loc[year, 'Mode'] = corruption_values.mode().iloc[0]
    corruption_stats.loc[year, 'Range'] = corruption_values.max(
    ) - corruption_values.min()
    corruption_stats.loc[year, 'Variance'] = corruption_values.var()
    corruption_stats.loc[year, 'Standard Deviation'] = corruption_values.std()

# Print or visualize the results
corruption_stats

# Prepare the independent variables
independent_vars_2020 = ['GDP per Capita(2020)', 'Annual GDP Growth(2020)',
                         'Social Support(2020)', 'Life Expectancy(2020)', 'Freedom(2020)', 'Corruption(2020)']
X = merged_happiness_income_gdp_df[independent_vars_2020]
X = sm.add_constant(X)  # Add a constant term for the intercept

# Prepare the dependent variable
y = merged_happiness_income_gdp_df['Happiness Score(2020)']

# Fit the regression model
model2020 = sm.OLS(y, X)
results2020 = model2020.fit()

# Print the regression results
results2020.summary()

# Prepare the independent variables
independent_vars_2021 = ['GDP per Capita(2021)', 'Annual GDP Growth(2021)',
                         'Social Support(2021)', 'Life Expectancy(2021)', 'Freedom(2021)', 'Corruption(2021)']
X = merged_happiness_income_gdp_df[independent_vars_2021]
X = sm.add_constant(X)  # Add a constant term for the intercept

# Prepare the dependent variable
y = merged_happiness_income_gdp_df['Happiness Score(2020)']

# Fit the regression model
model = sm.OLS(y, X)
results2021 = model.fit()

# Print the regression results
results2021.summary()

# Prepare the independent variables
independent_vars_2022 = ['GDP per Capita(2022)', 'Social Support(2022)',
                         'Life Expectancy(2022)', 'Freedom(2022)', 'Corruption(2022)']
X = merged_happiness_income_gdp_df[independent_vars_2022].astype(float)

# Prepare the dependent variable
y = merged_happiness_income_gdp_df['Happiness Score(2022)'].astype(float)

# Add a constant term to the independent variables
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X)
results2022 = model.fit()

# Print the regression table
results2022.summary()

# Prepare the independent variables
independent_vars_2023 = ['GDP per Capita(2023)', 'Annual GDP Growth(2023)',
                         'Social Support(2023)', 'Life Expectancy(2023)', 'Freedom(2023)', 'Corruption(2023)']
X = merged_happiness_income_gdp_df[independent_vars_2023]
X = sm.add_constant(X)  # Add a constant term for the intercept

# Prepare the dependent variable
y = merged_happiness_income_gdp_df['Happiness Score(2020)']

# Fit the regression model
model = sm.OLS(y, X)
results2023 = model.fit()

# Print the regression results
results2023.summary()

# layoff_counts_df based on year
df_2020 = layoff_counts_df[layoff_counts_df['Year'] == 2020]
df_2021 = layoff_counts_df[layoff_counts_df['Year'] == 2021]
df_2022 = layoff_counts_df[layoff_counts_df['Year'] == 2022]
df_2023 = layoff_counts_df[layoff_counts_df['Year'] == 2023]

# Select the countries we want to analyze
countries_2020 = happiness2020_final_df[happiness2020_final_df['Country'].isin(
    ['Canada', 'Germany', 'India', 'Singapore', 'United States'])]
countries_2021 = happiness2021_final_df[happiness2021_final_df['Country'].isin(
    ['Canada', 'Germany', 'India', 'Singapore', 'United States'])]
countries_2022 = happiness2022_final_df[happiness2022_final_df['Country'].isin(
    ['Canada', 'Germany', 'India', 'Singapore', 'United States'])]
countries_2023 = happiness2023_final_df[happiness2023_final_df['Country'].isin(
    ['Canada', 'Germany', 'India', 'Singapore', 'United States'])]

countries_df = [countries_2020, countries_2021, countries_2022, countries_2023]

# Merging with layoff_counts_df and rename the Total Layoffs into Total Layoffs 2020, 2021, 2022, 2023
countries_2020 = countries_2020.merge(df_2020, on='Country', how='inner')
countries_2021 = countries_2021.merge(df_2021, on='Country', how='inner')
countries_2022 = countries_2022.merge(df_2022, on='Country', how='inner')
countries_2023 = countries_2023.merge(df_2023, on='Country', how='inner')

countries_2020 = countries_2020.rename(
    columns={'Total Layoffs': 'Total Layoffs(2020)'}).drop('Year', axis=1)
countries_2021 = countries_2021.rename(
    columns={'Total Layoffs': 'Total Layoffs(2021)'}).drop('Year', axis=1)
countries_2022 = countries_2022.rename(
    columns={'Total Layoffs': 'Total Layoffs(2022)'}).drop('Year', axis=1)
countries_2023 = countries_2023.rename(
    columns={'Total Layoffs': 'Total Layoffs(2023)'}).drop('Year', axis=1)


# Perform ANOVA on Total Layoffs and other numerical variables in countries_2020, 2021, 2022, 2023
f_value_2020, p_value_2020 = f_oneway(countries_2020['Total Layoffs(2020)'], countries_2020['GDP per Capita(2020)'], countries_2020['Social Support(2020)'],
                                      countries_2020['Freedom(2020)'], countries_2020['Life Expectancy(2020)'], countries_2020['Corruption(2020)'])
f_value_2021, p_value_2021 = f_oneway(countries_2021['Total Layoffs(2021)'], countries_2021['GDP per Capita(2021)'], countries_2021['Social Support(2021)'],
                                      countries_2021['Freedom(2021)'], countries_2021['Life Expectancy(2021)'], countries_2021['Corruption(2021)'])
f_value_2022, p_value_2022 = f_oneway(countries_2022['Total Layoffs(2022)'], countries_2022['GDP per Capita(2022)'], countries_2022['Social Support(2022)'],
                                      countries_2022['Freedom(2022)'], countries_2022['Life Expectancy(2022)'], countries_2022['Corruption(2022)'])
f_value_2023, p_value_2023 = f_oneway(countries_2023['Total Layoffs(2023)'], countries_2023['GDP per Capita(2023)'], countries_2023['Social Support(2023)'],
                                      countries_2023['Freedom(2023)'], countries_2023['Life Expectancy(2023)'], countries_2023['Corruption(2023)'])

# Print the ANOVA results for 2020, 2021, 2022, 2023
print("ANOVA Results for 2020:")
print("F-value:", f_value_2020)
print("p-value:", p_value_2020)
print("\nANOVA Results for 2021:")
print("F-value:", f_value_2021)
print("p-value:", p_value_2021)
print("\nANOVA Results for 2022:")
print("F-value:", f_value_2022)
print("p-value:", p_value_2022)
print("\nANOVA Results for 2023:")
print("F-value:", f_value_2023)
print("p-value:", p_value_2023)

# Calculate the correlation matrix
correlation_matrix_2020 = countries_2020[[
    'Total Layoffs(2020)', 'GDP per Capita(2020)', 'Social Support(2020)', 'Freedom(2020)', 'Life Expectancy(2020)', 'Corruption(2020)']].corr()
correlation_matrix_2021 = countries_2021[[
    'Total Layoffs(2021)', 'GDP per Capita(2021)', 'Social Support(2021)', 'Freedom(2021)', 'Life Expectancy(2021)', 'Corruption(2021)']].corr()
correlation_matrix_2022 = countries_2022[[
    'Total Layoffs(2022)', 'GDP per Capita(2022)', 'Social Support(2022)', 'Freedom(2022)', 'Life Expectancy(2022)', 'Corruption(2022)']].corr()
correlation_matrix_2023 = countries_2023[[
    'Total Layoffs(2023)', 'GDP per Capita(2023)', 'Social Support(2023)', 'Freedom(2023)', 'Life Expectancy(2023)', 'Corruption(2023)']].corr()


# Print the correlation matrix
print("Correlation Matrices:")
print(correlation_matrix_2020)
print(correlation_matrix_2021)
print(correlation_matrix_2022)
print(correlation_matrix_2023)

ANOVA Results for 2020:
F-value: 1.8790906275979398
p-value: 0.13555576682648315

ANOVA Results for 2021:
F-value: 1.1238950086445267
p-value: 0.3744596631376042

ANOVA Results for 2022:
F-value: 1.543434476380803
p-value: 0.2140153418208807

ANOVA Results for 2023:
F-value: 1.5210157521177274
p-value: 0.2206163657990928
Correlation Matrices:
                       Total Layoffs(2020)  GDP per Capita(2020)  \
Total Layoffs(2020)               1.000000              0.065442   
GDP per Capita(2020)              0.065442              1.000000   
Social Support(2020)              0.071777              0.952352   
Freedom(2020)                    -0.710499              0.196501   
Life Expectancy(2020)            -0.333122              0.914011   
Corruption(2020)                  0.564938             -0.722159   

                       Social Support(2020)  Freedom(2020)  \
Total Layoffs(2020)                0.071777      -0.710499   
GDP per Capita(2020)               0.952352       0.196501   
Social Support(2020)               1.000000       0.162974   
Freedom(2020)                      0.162974       1.000000   
Life Expectancy(2020)              0.876587       0.523780   
Corruption(2020)                  -0.602841      -0.724024   

                       Life Expectancy(2020)  Corruption(2020)  
Total Layoffs(2020)                -0.333122          0.564938  
GDP per Capita(2020)                0.914011         -0.722159  
Social Support(2020)                0.876587         -0.602841  
Freedom(2020)                       0.523780         -0.724024  
Life Expectancy(2020)               1.000000         -0.910427  
Corruption(2020)                   -0.910427          1.000000  
                       Total Layoffs(2021)  GDP per Capita(2021)  \
Total Layoffs(2021)               1.000000              0.208591   
GDP per Capita(2021)              0.208591              1.000000   
Social Support(2021)              0.242181              0.962650   
Freedom(2021)                    -0.833706              0.034346   
Life Expectancy(2021)            -0.221019              0.902336   
Corruption(2021)                  0.456462             -0.711489   

                       Social Support(2021)  Freedom(2021)  \
Total Layoffs(2021)                0.242181      -0.833706   
GDP per Capita(2021)               0.962650       0.034346   
Social Support(2021)               1.000000      -0.046770   
Freedom(2021)                     -0.046770       1.000000   
Life Expectancy(2021)              0.863107       0.428201   
Corruption(2021)                  -0.584593      -0.690791   

                       Life Expectancy(2021)  Corruption(2021)  
Total Layoffs(2021)                -0.221019          0.456462  
GDP per Capita(2021)                0.902336         -0.711489  
Social Support(2021)                0.863107         -0.584593  
Freedom(2021)                       0.428201         -0.690791  
Life Expectancy(2021)               1.000000         -0.905389  
Corruption(2021)                   -0.905389          1.000000  
                       Total Layoffs(2022)  GDP per Capita(2022)  \
Total Layoffs(2022)               1.000000              0.184229   
GDP per Capita(2022)              0.184229              1.000000   
Social Support(2022)              0.245647              0.966124   
Freedom(2022)                    -0.653289             -0.163263   
Life Expectancy(2022)            -0.328448              0.866672   
Corruption(2022)                 -0.487136              0.697504   

                       Social Support(2022)  Freedom(2022)  \
Total Layoffs(2022)                0.245647      -0.653289   
GDP per Capita(2022)               0.966124      -0.163263   
Social Support(2022)               1.000000      -0.226507   
Freedom(2022)                     -0.226507       1.000000   
Life Expectancy(2022)              0.809397       0.200977   
Corruption(2022)                   0.563576       0.475026   

                       Life Expectancy(2022)  Corruption(2022)  
Total Layoffs(2022)                -0.328448         -0.487136  
GDP per Capita(2022)                0.866672          0.697504  
Social Support(2022)                0.809397          0.563576  
Freedom(2022)                       0.200977          0.475026  
Life Expectancy(2022)               1.000000          0.917590  
Corruption(2022)                    0.917590          1.000000  
                       Total Layoffs(2023)  GDP per Capita(2023)  \
Total Layoffs(2023)               1.000000              0.209212   
GDP per Capita(2023)              0.209212              1.000000   
Social Support(2023)              0.296326              0.914891   
Freedom(2023)                    -0.892642             -0.486436   
Life Expectancy(2023)            -0.310975              0.857550   
Corruption(2023)                  0.474146             -0.744542   

                       Social Support(2023)  Freedom(2023)  \
Total Layoffs(2023)                0.296326      -0.892642   
GDP per Capita(2023)               0.914891      -0.486436   
Social Support(2023)               1.000000      -0.599106   
Freedom(2023)                     -0.599106       1.000000   
Life Expectancy(2023)              0.774587      -0.036943   
Corruption(2023)                  -0.547266      -0.203000   

                       Life Expectancy(2023)  Corruption(2023)  
Total Layoffs(2023)                -0.310975          0.474146  
GDP per Capita(2023)                0.857550         -0.744542  
Social Support(2023)                0.774587         -0.547266  
Freedom(2023)                      -0.036943         -0.203000  
Life Expectancy(2023)               1.000000         -0.941625  
Corruption(2023)                   -0.941625          1.000000

# Plot the first jointplot
sns.jointplot(x='Annual GDP Growth(2020)', y='Happiness Score(2020)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Plot the second jointplot
sns.jointplot(x='Annual GDP Growth(2021)', y='Happiness Score(2021)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Plot the third jointplot
sns.jointplot(x='Annual GDP Growth(2022)', y='Happiness Score(2022)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Plot the fourth jointplot
sns.jointplot(x='Annual GDP Growth(2023)', y='Happiness Score(2023)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Remove extra whitespace between subplots
plt.tight_layout()

# Show the plots
plt.show()

# Plot the first jointplot
sns.jointplot(x='Life Expectancy(2020)', y='Happiness Score(2020)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Plot the second jointplot
sns.jointplot(x='Life Expectancy(2021)', y='Happiness Score(2021)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Plot the third jointplot
sns.jointplot(x='Life Expectancy(2022)', y='Happiness Score(2022)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Plot the fourth jointplot
sns.jointplot(x='Life Expectancy(2023)', y='Happiness Score(2023)',
              data=merged_happiness_income_gdp_df, kind='scatter')

# Remove extra whitespace between subplots
plt.tight_layout()

# Show the plot
plt.show()

# Select the variables for the outlier plot
variables_2020 = ['GDP per Capita(2020)', 'Social Support(2020)',
                  'Life Expectancy(2020)', 'Freedom(2020)', 'Corruption(2020)']
variables_2021 = ['GDP per Capita(2021)', 'Social Support(2021)',
                  'Life Expectancy(2021)', 'Freedom(2021)', 'Corruption(2021)']
variables_2022 = ['GDP per Capita(2022)', 'Social Support(2022)',
                  'Life Expectancy(2022)', 'Freedom(2022)', 'Corruption(2022)']
variables_2023 = ['GDP per Capita(2023)', 'Social Support(2023)',
                  'Life Expectancy(2023)', 'Freedom(2023)', 'Corruption(2023)']

# Create subplots for all four box plots
fig, axs = plt.subplots(2, 2, figsize=(15, 10))

# Plot boxplots for each year
for i, year in enumerate([variables_2020, variables_2021, variables_2022, variables_2023]):
    row = i // 2
    col = i % 2
    merged_happiness_income_gdp_df[year].boxplot(ax=axs[row, col])
    axs[row, col].set_title(f'Outlier Plot - {years[i]}')
    axs[row, col].set_xlabel('Variables')
    axs[row, col].set_ylabel('Values')
    axs[row, col].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Loop through each year and generate residual plots
for year, independent_vars in zip(range(2020, 2024), [independent_vars_2020, independent_vars_2021, independent_vars_2022, independent_vars_2023]):
    X = merged_happiness_income_gdp_df[independent_vars].astype(
        float)  # Convert independent variables to float

    # Prepare the dependent variable
    y = merged_happiness_income_gdp_df[f'Happiness Score({year})'].astype(
        float)  # Convert dependent variable to float

    # Add a constant term to the independent variables
    X = sm.add_constant(X)

    # Fit the regression model
    model = sm.OLS(y, X)
    results = model.fit()

    # Generate predicted values
    predicted_values = results.predict(X)

    # Calculate residuals
    residuals = y - predicted_values

    # Create residual plot
    plt.figure(figsize=(8, 6))
    plt.scatter(predicted_values, residuals, alpha=0.5)
    # Add a horizontal line at y=0
    plt.axhline(y=0, color='red', linestyle='--')
    plt.title(f'Residual Plot for Year {year}')
    plt.xlabel('Predicted Values')
    plt.ylabel('Residuals')
    plt.grid(True)
    plt.show()

# Visualize the four correlation matrices with a different color scheme
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('Correlation Matrices (2020-2023)', fontsize=16)

# 2020
sns.heatmap(correlation_matrix_2020, annot=True, cmap='rocket', ax=axes[0, 0])
axes[0, 0].set_title('2020')

# 2021
sns.heatmap(correlation_matrix_2021, annot=True, cmap='rocket', ax=axes[0, 1])
axes[0, 1].set_title('2021')

# 2022
sns.heatmap(correlation_matrix_2022, annot=True, cmap='rocket', ax=axes[1, 0])
axes[1, 0].set_title('2022')

# 2023
sns.heatmap(correlation_matrix_2023, annot=True, cmap='rocket', ax=axes[1, 1])
axes[1, 1].set_title('2023')

# Adjust the layout
plt.tight_layout()
plt.show()

# Create a figure and four subplots
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

sns.scatterplot(x="Income Group", y="Happiness Score(2020)",
                sizes=(10, 40), alpha=.5,
                data=merged_happiness_income_gdp_df, ax=axes[0, 0])
axes[0, 0].set_title('Happiness Score by Income group(2020)')
axes[0, 0].set_xlabel('Income Group')
axes[0, 0].set_ylabel('Happiness Score')

# Plot the second subplot
sns.scatterplot(x="Income Group", y="Happiness Score(2021)",
                sizes=(10, 40), alpha=.5,
                data=merged_happiness_income_gdp_df, ax=axes[0, 1])
axes[0, 1].set_title('Happiness Score by Income group(2021)')
axes[0, 1].set_xlabel('Income Group')
axes[0, 1].set_ylabel('Happiness Score')

# Plot the third subplot
sns.scatterplot(x="Income Group", y="Happiness Score(2022)",
                sizes=(10, 40), alpha=.5,
                data=merged_happiness_income_gdp_df, ax=axes[1, 0])
axes[1, 0].set_title('Happiness Score by Income group(2022)')
axes[1, 0].set_xlabel('Income Group')
axes[1, 0].set_ylabel('Happiness Score')

# Plot the fourth subplot
sns.scatterplot(x="Income Group", y="Happiness Score(2023)",
                sizes=(10, 40), alpha=.5,
                data=merged_happiness_income_gdp_df, ax=axes[1, 1])
axes[1, 1].set_title('Happiness Score by Income group(2023)')
axes[1, 1].set_xlabel('Income Group')
axes[1, 1].set_ylabel('Happiness Score')

# Rotate the x-axis labels for better readability
axes[0, 0].tick_params(axis='x', rotation=10)
axes[0, 1].tick_params(axis='x', rotation=10)
axes[1, 0].tick_params(axis='x', rotation=45)
axes[1, 1].tick_params(axis='x', rotation=45)

# Adjust spacing between subplots
plt.subplots_adjust(left=0.1, right=0.9, top=0.9,
                    bottom=0.1, wspace=0.5, hspace=0.4)

# Show the plots
plt.show()

# Create a figure and four subplots
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Plot the first subplot
sns.barplot(x='Region', y='Happiness Score(2020)',
            data=merged_happiness_income_gdp_df, ax=axes[0, 0])
axes[0, 0].set_title('Difference in Happiness Score by Region (2020)')
axes[0, 0].set_xlabel('Region')
axes[0, 0].set_ylabel('Happiness Score')

# Plot the second subplot
sns.barplot(x='Region', y='Happiness Score(2021)',
            data=merged_happiness_income_gdp_df, ax=axes[0, 1])
axes[0, 1].set_title('Difference in Happiness Score by Region (2021)')
axes[0, 1].set_xlabel('Region')
axes[0, 1].set_ylabel('Happiness Score')

# Plot the third subplot
sns.barplot(x='Region', y='Happiness Score(2022)',
            data=merged_happiness_income_gdp_df, ax=axes[1, 0])
axes[1, 0].set_title('Difference in Happiness Score by Region (2022)')
axes[1, 0].set_xlabel('Region')
axes[1, 0].set_ylabel('Happiness Score')

# Plot the fourth subplot
sns.barplot(x='Region', y='Happiness Score(2023)',
            data=merged_happiness_income_gdp_df, ax=axes[1, 1])
axes[1, 1].set_title('Difference in Happiness Score by Region (2023)')
axes[1, 1].set_xlabel('Region')
axes[1, 1].set_ylabel('Happiness Score')

# Rotate the x-axis labels for better readability
for ax in axes.flat:
    ax.tick_params(axis='x', rotation=45)

# Adjust spacing between subplots
plt.tight_layout()

# Show the plots
plt.show()

# Reshape the DataFrame
merged_happiness_income_gdp_df_melted = pd.melt(merged_happiness_income_gdp_df, id_vars=['Region', 'Income Group'],
                                                value_vars=[
                                                    'Happiness Score(2020)', 'Happiness Score(2021)', 'Happiness Score(2022)', 'Happiness Score(2023)'],
                                                var_name='Year', value_name='Happiness Score')

# Extract the year from the ‘Year’ column and convert it to an integer
merged_happiness_income_gdp_df_melted['Year'] = merged_happiness_income_gdp_df_melted['Year'].str.extract(
    '(\d+)').astype(int)

# Create a line plot
fig, ax = plt.subplots(figsize=(10, 6))

# Loop through each region and income group and plot the data
for (region, income_group), group in merged_happiness_income_gdp_df_melted.groupby(['Region', 'Income Group']):
    group.plot(x='Year', y='Happiness Score', ax=ax,
               label=f"{region} - {income_group}", marker='o')

# Set the title, x-axis label, and y-axis label
plt.title('Happiness Score by Region and Income Group Over the Years')
plt.xlabel('Year')
plt.ylabel('Happiness Score')
plt.legend(title='Region - Income Group',
           bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Create a cross-tabulation to analyze the relationship between 'Region' and 'Income Group'
cross_tab = pd.crosstab(
    # Specify 'Region' as the rows (index)
    index=merged_happiness_income_gdp_df['Region'],
    # Specify 'Income Group' as the columns
    columns=merged_happiness_income_gdp_df['Income Group']
)
# Display the cross-tabulation results
cross_tab

# Create pivot tables for each year
pivot_table_2020 = pd.pivot_table(merged_happiness_income_gdp_df, index='Region',
                                  columns='Income Group', values='Happiness Score(2020)', aggfunc='mean', fill_value=0)
pivot_table_2021 = pd.pivot_table(merged_happiness_income_gdp_df, index='Region',
                                  columns='Income Group', values='Happiness Score(2021)', aggfunc='mean', fill_value=0)
pivot_table_2022 = pd.pivot_table(merged_happiness_income_gdp_df, index='Region',
                                  columns='Income Group', values='Happiness Score(2022)', aggfunc='mean', fill_value=0)
pivot_table_2023 = pd.pivot_table(merged_happiness_income_gdp_df, index='Region',
                                  columns='Income Group', values='Happiness Score(2023)', aggfunc='mean', fill_value=0)

# Combine the pivot tables into one
combined_pivot_table = pd.concat([pivot_table_2020, pivot_table_2021, pivot_table_2022,
                                 pivot_table_2023], axis=1, keys=['2020', '2021', '2022', '2023'])

# Display the combined pivot table
combined_pivot_table

# Create a pairplot to visualize the relationship between happiness scores
sns.pairplot(data=merged_happiness_income_gdp_df,
             hue="Region",
             vars=['Happiness Score(2020)', 'Happiness Score(2021)', 'Happiness Score(2022)', 'Happiness Score(2023)'])

<seaborn.axisgrid.PairGrid at 0x28fd09250>

# Display the merged data frame
categoried_happiness_income_gdp_df = merged_happiness_income_gdp_df.copy()

# Check for missing values
print(categoried_happiness_income_gdp_df.isnull().sum())

# Handle missing values (if any) by either dropping rows or imputing values
categoried_happiness_income_gdp_df.dropna(
    inplace=True)  # Drop rows with missing values

# Check for inconsistencies or outliers
categoried_happiness_income_gdp_df.describe()

Country                    0
Region                     0
Income Group               0
Happiness Score(2020)      0
Happiness Score(2021)      0
Happiness Score(2022)      0
Happiness Score(2023)      0
GDP per Capita(2020)       0
Annual GDP Growth(2020)    0
GDP per Capita(2021)       0
Annual GDP Growth(2021)    0
GDP per Capita(2022)       0
Annual GDP Growth(2022)    0
GDP per Capita(2023)       0
Annual GDP Growth(2023)    0
Social Support(2020)       0
Social Support(2021)       0
Social Support(2022)       0
Social Support(2023)       0
Life Expectancy(2020)      0
Life Expectancy(2021)      0
Life Expectancy(2022)      0
Life Expectancy(2023)      0
Freedom(2020)              0
Freedom(2021)              0
Freedom(2022)              0
Freedom(2023)              0
Corruption(2020)           0
Corruption(2021)           0
Corruption(2022)           0
Corruption(2023)           0
dtype: int64

# Feature scaling and normalization
from sklearn.preprocessing import StandardScaler

# Select numerical columns for scaling
numerical_columns = ['GDP per Capita(2020)', 'Annual GDP Growth(2020)', 'Social Support(2020)',
                     'Life Expectancy(2020)', 'Freedom(2020)', 'Corruption(2020)',
                     'GDP per Capita(2021)', 'Annual GDP Growth(2021)', 'Social Support(2021)',
                     'Life Expectancy(2021)', 'Freedom(2021)', 'Corruption(2021)',
                     'GDP per Capita(2022)', 'Annual GDP Growth(2022)', 'Social Support(2022)',
                     'Life Expectancy(2022)', 'Freedom(2022)', 'Corruption(2022)',
                     'GDP per Capita(2023)', 'Annual GDP Growth(2023)', 'Social Support(2023)',
                     'Life Expectancy(2023)', 'Freedom(2023)', 'Corruption(2023)'
                     ]

# Apply StandardScaler to normalize the data
scaler = StandardScaler()
categoried_happiness_income_gdp_df[numerical_columns] = scaler.fit_transform(
    categoried_happiness_income_gdp_df[numerical_columns])

from sklearn.preprocessing import LabelEncoder

# Define the columns containing categorical variables
categorical_columns = ['Country', 'Region', 'Income Group']

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply LabelEncoder to each categorical column
for col in categorical_columns:
    categoried_happiness_income_gdp_df[col] = label_encoder.fit_transform(
        categoried_happiness_income_gdp_df[col])

# Display the updated DataFrame
categoried_happiness_income_gdp_df

# Dimensionality reduction using PCA
from sklearn.decomposition import PCA

# Select columns for PCA
pca_columns = ['GDP per Capita(2020)', 'Annual GDP Growth(2020)', 'Social Support(2020)',
               'Life Expectancy(2020)', 'Freedom(2020)', 'Corruption(2020)',
               'GDP per Capita(2021)', 'Annual GDP Growth(2021)', 'Social Support(2021)',
               'Life Expectancy(2021)', 'Freedom(2021)', 'Corruption(2021)',
               'GDP per Capita(2022)', 'Annual GDP Growth(2022)', 'Social Support(2022)',
               'Life Expectancy(2022)', 'Freedom(2022)', 'Corruption(2022)',
               'GDP per Capita(2023)', 'Annual GDP Growth(2023)', 'Social Support(2023)',
               'Life Expectancy(2023)', 'Freedom(2023)', 'Corruption(2023)'
               ]

# Apply PCA
# Adjust the number of components based on your needs
pca = PCA(n_components=3)
pca_components = pca.fit_transform(
    categoried_happiness_income_gdp_df[pca_columns])
pca_df = pd.DataFrame(pca_components, columns=['PC1', 'PC2', 'PC3'])

# Display the PCA DataFrame
pca_df

# Concatenate the PCA components with the original dataframe
categoried_happiness_income_gdp_df = pd.concat(
    [categoried_happiness_income_gdp_df, pca_df], axis=1)
categoried_happiness_income_gdp_df.drop(pca_columns, axis=1, inplace=True)

# Display the updated DataFrame
categoried_happiness_income_gdp_df

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Splitting the data into features and target variable
X = categoried_happiness_income_gdp_df.drop(
    ['Happiness Score(2023)'], axis=1)  # Features
# Target variable
y = categoried_happiness_income_gdp_df['Happiness Score(2023)']

# Splitting the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Initialize regressors
logistic_regression = LinearRegression()
decision_tree = DecisionTreeRegressor()
random_forest = RandomForestRegressor()
svm_regressor = SVR()

# Train regressors on the training data
logistic_regression.fit(X_train, y_train)

LinearRegression()

LinearRegression()

# Train the decision tree regressor
decision_tree.fit(X_train, y_train)

DecisionTreeRegressor()

DecisionTreeRegressor()

# Train the random forest regressor
random_forest.fit(X_train, y_train)

RandomForestRegressor()

RandomForestRegressor()

# Train the SVM regressor
svm_regressor.fit(X_train, y_train)

SVR()

SVR()

# Make predictions on the test set
y_pred_lr = logistic_regression.predict(X_test)
y_pred_dt = decision_tree.predict(X_test)
y_pred_rf = random_forest.predict(X_test)
y_pred_svm = svm_regressor.predict(X_test)

# Evaluate regressor performance
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
mse_dt = mean_squared_error(y_test, y_pred_dt)
r2_dt = r2_score(y_test, y_pred_dt)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
mse_svm = mean_squared_error(y_test, y_pred_svm)
r2_svm = r2_score(y_test, y_pred_svm)

# Print results
print("Linear Regression:")
print(f"Mean Squared Error: {mse_lr:.2f}")
print(f"R-squared: {r2_lr:.2f}")
print("----------------------------------------------------")
print("Decision Tree:")
print(f"Mean Squared Error: {mse_dt:.2f}")
print(f"R-squared: {r2_dt:.2f}")
print("----------------------------------------------------")
print("Random Forest:")
print(f"Mean Squared Error: {mse_rf:.2f}")
print(f"R-squared: {r2_rf:.2f}")
print("----------------------------------------------------")
print("Support Vector Machine:")
print(f"Mean Squared Error: {mse_svm:.2f}")
print(f"R-squared: {r2_svm:.2f}")

Linear Regression:
Mean Squared Error: 0.05
R-squared: 0.97
----------------------------------------------------
Decision Tree:
Mean Squared Error: 0.10
R-squared: 0.93
----------------------------------------------------
Random Forest:
Mean Squared Error: 0.09
R-squared: 0.94
----------------------------------------------------
Support Vector Machine:
Mean Squared Error: 0.55
R-squared: 0.65

from sklearn.model_selection import GridSearchCV

# Define parameter grids for each model
param_grid_dt = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

# Perform grid search cross-validation for each model
grid_search_dt = GridSearchCV(
    decision_tree, param_grid_dt, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_rf = GridSearchCV(
    random_forest, param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_svm = GridSearchCV(
    svm_regressor, param_grid_svm, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the grid search objects
grid_search_dt.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)
grid_search_svm.fit(X_train, y_train)

# Get the best hyperparameters and best estimators
best_params_dt = grid_search_dt.best_params_
best_estimator_dt = grid_search_dt.best_estimator_

best_params_rf = grid_search_rf.best_params_
best_estimator_rf = grid_search_rf.best_estimator_

best_params_svm = grid_search_svm.best_params_
best_estimator_svm = grid_search_svm.best_estimator_

# Print the best hyperparameters for each model
print("Best hyperparameters for Decision Tree:", best_params_dt)
print("Best hyperparameters for Random Forest:", best_params_rf)
print("Best hyperparameters for Support Vector Machine:", best_params_svm)

Best hyperparameters for Decision Tree: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best hyperparameters for Random Forest: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best hyperparameters for Support Vector Machine: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}

# Use the best estimators for predictions
y_pred_dt_tuned = best_estimator_dt.predict(X_test)
y_pred_rf_tuned = best_estimator_rf.predict(X_test)
y_pred_svm_tuned = best_estimator_svm.predict(X_test)

# Evaluate performance
mse_dt_tuned = mean_squared_error(y_test, y_pred_dt_tuned)
r2_dt_tuned = r2_score(y_test, y_pred_dt_tuned)

mse_rf_tuned = mean_squared_error(y_test, y_pred_rf_tuned)
r2_rf_tuned = r2_score(y_test, y_pred_rf_tuned)

mse_svm_tuned = mean_squared_error(y_test, y_pred_svm_tuned)
r2_svm_tuned = r2_score(y_test, y_pred_svm_tuned)

# Print results
print("Tuned Decision Tree:")
print(f"Mean Squared Error: {mse_dt_tuned:.2f}")
print(f"R-squared: {r2_dt_tuned:.2f}")
print("----------------------------------------------------")
print("Tuned Random Forest:")
print(f"Mean Squared Error: {mse_rf_tuned:.2f}")
print(f"R-squared: {r2_rf_tuned:.2f}")
print("----------------------------------------------------")
print("Tuned Support Vector Machine:")
print(f"Mean Squared Error: {mse_svm_tuned:.2f}")
print(f"R-squared: {r2_svm_tuned:.2f}")

Tuned Decision Tree:
Mean Squared Error: 0.16
R-squared: 0.90
----------------------------------------------------
Tuned Random Forest:
Mean Squared Error: 0.08
R-squared: 0.95
----------------------------------------------------
Tuned Support Vector Machine:
Mean Squared Error: 0.06
R-squared: 0.96

# Define a function to create scatter plots
def plot_scatter(y_true, y_pred, model_name):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_true, y_pred, color='blue', alpha=0.5)
    plt.plot([y_true.min(), y_true.max()], [
             y_true.min(), y_true.max()], 'k--', lw=2)
    plt.xlabel('True Happiness Score')
    plt.ylabel('Predicted Happiness Score')
    plt.title(f'{model_name} - True vs Predicted Happiness Score')
    plt.grid(True)
    plt.show()


# Plot scatter plots for each model
plot_scatter(y_test, y_pred_lr, 'Linear Regression')
plot_scatter(y_test, y_pred_dt, 'Decision Tree')
plot_scatter(y_test, y_pred_rf, 'Random Forest')
plot_scatter(y_test, y_pred_svm, 'Support Vector Machine')

from sklearn.ensemble import VotingRegressor

# Prepare the data for predicting the happiness score for 2024
X_2024 = categoried_happiness_income_gdp_df.drop(
    ['Happiness Score(2023)'], axis=1)

# Create a voting regressor with the trained models
voting_regressor = VotingRegressor([('lr', logistic_regression),
                                    ('dt', decision_tree),
                                    ('rf', random_forest),
                                    ('svm', svm_regressor)])

# Fit the voting regressor on the training data
voting_regressor.fit(X_train, y_train)

# Use the voting regressor to predict the happiness score for 2024
y_pred_ensemble_2024 = pd.DataFrame(voting_regressor.predict(X_2024))

# Print the ensemble predictions for 2024
y_pred_ensemble_2024

# Create a DataFrame with country names and ensemble predictions
predictions_df = pd.DataFrame({'Country': merged_happiness_income_gdp_df['Country'],
                               'Happiness Score(2024)': voting_regressor.predict(X_2024)})

# Save the DataFrame to a CSV file
predictions_df.to_csv('happiness_predictions_2024.csv', index=False)

# Print a success message
print("Predictions saved to 'happiness_predictions_2024.csv' successfully.")
predictions_df

Predictions saved to 'happiness_predictions_2024.csv' successfully.

# Importing the real happiness score for 2024
real_happiness_2024 = pd.read_csv('world_happiness_report_2024.csv')
real_happiness_2024 = real_happiness_2024[['Country name', 'Ladder score']]
real_happiness_2024 = real_happiness_2024.rename(
    columns={'Country name': 'Country', 'Ladder score': 'Happiness Score(2024)'})
real_happiness_2024

# Merge prediction data frame and real dataset based on the 'Country' column
merged_df = pd.merge(predictions_df, real_happiness_2024,
                     on='Country', suffixes=('_predicted', '_actual'))

# Calculate the number of common countries
common_countries_count = merged_df.shape[0]

# Calculate the number of correctly predicted countries
correctly_predicted_count = (
    merged_df['Happiness Score(2024)_predicted'] == merged_df['Happiness Score(2024)_actual']).sum()

# Calculate simple accuracy
accuracy = correctly_predicted_count / common_countries_count

# Display the merged data frame
print("Merged DataFrame with common countries:")
merged_df

Merged DataFrame with common countries:

# Define a threshold for accuracy
threshold = 0.3

# Calculate the absolute difference between predicted and actual scores
merged_df['Score_Difference'] = abs(
    merged_df['Happiness Score(2024)_predicted'] - merged_df['Happiness Score(2024)_actual'])

# Count the number of accurate predictions within the threshold
accurate_predictions_count = (merged_df['Score_Difference'] <= threshold).sum()

# Calculate accuracy
accuracy = accurate_predictions_count / merged_df.shape[0]

# Display the accuracy
print("Accuracy:", accuracy)

Accuracy: 0.7948717948717948

from scipy.stats import spearmanr

# Calculate Spearman correlation coefficient for each column
spearman_correlation, p_value = spearmanr(
    merged_df['Happiness Score(2024)_predicted'], merged_df['Happiness Score(2024)_actual'])

# Display Spearman correlation coefficient
print("Spearman Correlation Coefficient:", spearman_correlation)

Spearman Correlation Coefficient: 0.9779713956929147

# Calculate ranks for predicted and actual happiness scores
merged_df['Rank_Predicted'] = merged_df['Happiness Score(2024)_predicted'].rank(
    ascending=False)
merged_df['Rank_Actual'] = merged_df['Happiness Score(2024)_actual'].rank(
    ascending=False)

# Calculate the absolute differences between the ranks
merged_df['Rank_Difference'] = abs(
    merged_df['Rank_Predicted'] - merged_df['Rank_Actual'])

merged_df

# Calculate Mean Absolute Error (MAE)
mae = (merged_df['Rank_Predicted'] - merged_df['Rank_Actual']).abs().mean()
print("Mean Absolute Error (MAE):", mae)

Mean Absolute Error (MAE): 3.3333333333333335

# Calculate Root Mean Squared Error (RMSE)
rmse = ((merged_df['Rank_Predicted'] -
        merged_df['Rank_Actual']) ** 2).mean() ** 0.5
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 4.725815626252608

	company	location	industry	total_laid_off	percentage_laid_off	date	stage	country	funds_raised
0	New Work	Hamburg	Consumer	400.0	NaN	2024-01-11	Post-IPO	Germany	NaN
1	Playtika	Tel Aviv	Consumer	300.0	0.10	2024-01-11	Post-IPO	Israel	NaN
2	Discord	SF Bay Area	Consumer	170.0	0.17	2024-01-11	Series H	United States	995.0
3	Inmobi	Bengaluru	Marketing	125.0	0.05	2024-01-11	Unknown	India	320.0
4	Audible	New York City	Media	100.0	0.05	2024-01-11	Acquired	United States	14.0
...	...	...	...	...	...	...	...	...	...
3308	Service	Los Angeles	Travel	NaN	1.00	2020-03-16	Seed	United States	5.1
3309	HopSkipDrive	Los Angeles	Transportation	8.0	0.10	2020-03-13	Unknown	United States	45.0
3310	Panda Squad	SF Bay Area	Consumer	6.0	0.75	2020-03-13	Seed	United States	1.0
3311	Tamara Mellon	Los Angeles	Retail	20.0	0.40	2020-03-12	Series C	United States	90.0
3312	EasyPost	Salt Lake City	Logistics	75.0	NaN	2020-03-11	Series A	United States	12.0

	country	date	total_laid_off	percentage_laid_off
0	Germany	2024-01-11	400.0	NaN
1	Israel	2024-01-11	300.0	0.10
2	United States	2024-01-11	170.0	0.17
3	India	2024-01-11	125.0	0.05
4	United States	2024-01-11	100.0	0.05
...	...	...	...	...
3308	United States	2020-03-16	NaN	1.00
3309	United States	2020-03-13	8.0	0.10
3310	United States	2020-03-13	6.0	0.75
3311	United States	2020-03-12	20.0	0.40
3312	United States	2020-03-11	75.0	NaN

	Country	Date	Total Layoffs	Layoff Percentage
1	Israel	2024-01-11	300.0	0.10
2	United States	2024-01-11	170.0	0.17
3	India	2024-01-11	125.0	0.05
4	United States	2024-01-11	100.0	0.05
5	United States	2024-01-11	60.0	0.13
...	...	...	...	...
3306	United States	2020-03-16	130.0	0.22
3307	United States	2020-03-16	16.0	1.00
3309	United States	2020-03-13	8.0	0.10
3310	United States	2020-03-13	6.0	0.75
3311	United States	2020-03-12	20.0	0.40

Year	2020	2021	2022	2023
Country
Canada	1141.0	45.0	3185.0	4341.0
Germany	331.0	87.0	2424.0	12278.0
India	9472.0	200.0	9121.0	6984.0
Singapore	2361.0	21.0	3484.0	1169.0
United States	45082.0	6150.0	97176.0	136900.0

	country_code	region	income_group
0	ABW	Latin America & Caribbean	High income
1	AFG	South Asia	Low income
2	AGO	Sub-Saharan Africa	Lower middle income
3	ALB	Europe & Central Asia	Upper middle income
4	AND	Europe & Central Asia	High income
...	...	...	...
212	XKX	Europe & Central Asia	Upper middle income
213	YEM	Middle East & North Africa	Low income
214	ZAF	Sub-Saharan Africa	Upper middle income
215	ZMB	Sub-Saharan Africa	Lower middle income
216	ZWE	Sub-Saharan Africa	Lower middle income

	Country	Date	Total Layoffs	Layoff Percentage	Year
41	India	2023-12-20	200.0	0.15	2023
42	United States	2023-12-19	100.0	0.20	2023
44	United States	2023-12-18	350.0	0.10	2023
45	India	2023-12-18	100.0	0.10	2023
51	United States	2023-12-14	900.0	0.24	2023
...	...	...	...	...	...
3306	United States	2020-03-16	130.0	0.22	2020
3307	United States	2020-03-16	16.0	1.00	2020
3309	United States	2020-03-13	8.0	0.10	2020
3310	United States	2020-03-13	6.0	0.75	2020
3311	United States	2020-03-12	20.0	0.40	2020

	Country	Year	Total Layoffs
0	Argentina	2022	283.0
1	Australia	2020	96.0
2	Australia	2022	1088.0
3	Australia	2023	1702.0
4	Austria	2022	270.0
...	...	...	...
79	United Kingdom	2023	6818.0
80	United States	2020	45082.0
81	United States	2021	6150.0
82	United States	2022	97176.0
83	United States	2023	136900.0

	country_code	region	income_group	Country
0	ABW	Latin America & Caribbean	High income	Aruba
1	AFG	South Asia	Low income	Afghanistan
2	AGO	Sub-Saharan Africa	Lower middle income	Angola
3	ALB	Europe & Central Asia	Upper middle income	Albania
4	AND	Europe & Central Asia	High income	Andorra
...	...	...	...	...
212	XKX	Europe & Central Asia	Upper middle income	Unknown
213	YEM	Middle East & North Africa	Low income	Yemen
214	ZAF	Sub-Saharan Africa	Upper middle income	South Africa
215	ZMB	Sub-Saharan Africa	Lower middle income	Zambia
216	ZWE	Sub-Saharan Africa	Lower middle income	Zimbabwe

	Real GDP growth (Annual percent change)	1980	1981	1982	1983	1984	1985	1986	1987	1988	...	2019	2020	2021	2022	2023	2024	2025	2026	2027	2028
0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	Afghanistan	no data	no data	no data	no data	no data	no data	no data	no data	no data	...	3.9	-2.4	-20.7	no data	no data	no data	no data	no data	no data	no data
2	Albania	2.7	5.7	2.9	1.1	2	-1.5	5.6	-0.8	-1.4	...	2.1	-3.3	8.9	4.8	3.6	3.3	3.4	3.5	3.5	3.5
3	Algeria	-5.4	3	6.4	5.4	5.6	5.6	-0.2	-0.7	-1.9	...	1	-5.1	3.4	3.2	3.8	3.1	2.5	1.9	1.7	1.7
4	Andorra	no data	no data	no data	no data	no data	no data	no data	no data	no data	...	2	-11.2	8.3	8.8	2.1	1.5	1.5	1.5	1.5	1.5
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
226	Other advanced economies	3.8	4.2	2.1	4	6.6	4.3	5.5	6.5	5.5	...	2	-1.6	5.7	2.6	1.8	2.2	2.3	2.3	2.2	2.2
227	Sub-Saharan Africa	no data	no data	no data	no data	no data	no data	no data	no data	no data	...	3.2	-1.6	4.7	4	3.3	4	4.1	4.1	4.2	4.3
228	World	2.2	2.1	0.7	2.6	4.6	3.6	3.6	3.9	4.7	...	2.8	-2.8	6.3	3.5	3	2.9	3.2	3.2	3.1	3.1
229	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
230	©IMF, 2023	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Country name	Regional indicator	Ladder score	Standard error of ladder score	upperwhisker	lowerwhisker	Logged GDP per capita	Social support	Healthy life expectancy	Freedom to make life choices	Generosity	Perceptions of corruption	Ladder score in Dystopia	Explained by: Log GDP per capita	Explained by: Social support	Explained by: Healthy life expectancy	Explained by: Freedom to make life choices	Explained by: Generosity	Explained by: Perceptions of corruption	Dystopia + residual
0	Finland	Western Europe	7.8087	0.031156	7.869766	7.747634	10.639267	0.954330	71.900825	0.949172	-0.059482	0.195445	1.972317	1.285190	1.499526	0.961271	0.662317	0.159670	0.477857	2.762835
1	Denmark	Western Europe	7.6456	0.033492	7.711245	7.579955	10.774001	0.955991	72.402504	0.951444	0.066202	0.168489	1.972317	1.326949	1.503449	0.979333	0.665040	0.242793	0.495260	2.432741
2	Switzerland	Western Europe	7.5599	0.035014	7.628528	7.491272	10.979933	0.942847	74.102448	0.921337	0.105911	0.303728	1.972317	1.390774	1.472403	1.040533	0.628954	0.269056	0.407946	2.350267
3	Iceland	Western Europe	7.5045	0.059616	7.621347	7.387653	10.772559	0.974670	73.000000	0.948892	0.246944	0.711710	1.972317	1.326502	1.547567	1.000843	0.661981	0.362330	0.144541	2.460688
4	Norway	Western Europe	7.4880	0.034837	7.556281	7.419719	11.087804	0.952487	73.200783	0.955750	0.134533	0.263218	1.972317	1.424207	1.495173	1.008072	0.670201	0.287985	0.434101	2.168266
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
148	Central African Republic	Sub-Saharan Africa	3.4759	0.115183	3.701658	3.250141	6.625160	0.319460	45.200001	0.640881	0.082410	0.891807	1.972317	0.041072	0.000000	0.000000	0.292814	0.253513	0.028265	2.860198
149	Rwanda	Sub-Saharan Africa	3.3123	0.052425	3.415053	3.209547	7.600104	0.540835	61.098846	0.900589	0.055484	0.183541	1.972317	0.343243	0.522876	0.572383	0.604088	0.235705	0.485542	0.548445
150	Zimbabwe	Sub-Saharan Africa	3.2992	0.058674	3.414202	3.184198	7.865712	0.763093	55.617260	0.711458	-0.072064	0.810237	1.972317	0.425564	1.047835	0.375038	0.377405	0.151349	0.080929	0.841031
151	South Sudan	Sub-Saharan Africa	2.8166	0.107610	3.027516	2.605684	7.425360	0.553707	51.000000	0.451314	0.016519	0.763417	1.972317	0.289083	0.553279	0.208809	0.065609	0.209935	0.111157	1.378751
152	Afghanistan	South Asia	2.5669	0.031311	2.628270	2.505530	7.462861	0.470367	52.590000	0.396573	-0.096429	0.933687	1.972317	0.300706	0.356434	0.266052	0.000000	0.135235	0.001226	1.507236

	Country name	Regional indicator	Ladder score	Standard error of ladder score	upperwhisker	lowerwhisker	Logged GDP per capita	Social support	Healthy life expectancy	Freedom to make life choices	Generosity	Perceptions of corruption	Ladder score in Dystopia	Explained by: Log GDP per capita	Explained by: Social support	Explained by: Healthy life expectancy	Explained by: Freedom to make life choices	Explained by: Generosity	Explained by: Perceptions of corruption	Dystopia + residual
0	Finland	Western Europe	7.842	0.032	7.904	7.780	10.775	0.954	72.000	0.949	-0.098	0.186	2.43	1.446	1.106	0.741	0.691	0.124	0.481	3.253
1	Denmark	Western Europe	7.620	0.035	7.687	7.552	10.933	0.954	72.700	0.946	0.030	0.179	2.43	1.502	1.108	0.763	0.686	0.208	0.485	2.868
2	Switzerland	Western Europe	7.571	0.036	7.643	7.500	11.117	0.942	74.400	0.919	0.025	0.292	2.43	1.566	1.079	0.816	0.653	0.204	0.413	2.839
3	Iceland	Western Europe	7.554	0.059	7.670	7.438	10.878	0.983	73.000	0.955	0.160	0.673	2.43	1.482	1.172	0.772	0.698	0.293	0.170	2.967
4	Netherlands	Western Europe	7.464	0.027	7.518	7.410	10.932	0.942	72.400	0.913	0.175	0.338	2.43	1.501	1.079	0.753	0.647	0.302	0.384	2.798
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
144	Lesotho	Sub-Saharan Africa	3.512	0.120	3.748	3.276	7.926	0.787	48.700	0.715	-0.131	0.915	2.43	0.451	0.731	0.007	0.405	0.103	0.015	1.800
145	Botswana	Sub-Saharan Africa	3.467	0.074	3.611	3.322	9.782	0.784	59.269	0.824	-0.246	0.801	2.43	1.099	0.724	0.340	0.539	0.027	0.088	0.648
146	Rwanda	Sub-Saharan Africa	3.415	0.068	3.548	3.282	7.676	0.552	61.400	0.897	0.061	0.167	2.43	0.364	0.202	0.407	0.627	0.227	0.493	1.095
147	Zimbabwe	Sub-Saharan Africa	3.145	0.058	3.259	3.030	7.943	0.750	56.201	0.677	-0.047	0.821	2.43	0.457	0.649	0.243	0.359	0.157	0.075	1.205
148	Afghanistan	South Asia	2.523	0.038	2.596	2.449	7.695	0.463	52.493	0.382	-0.102	0.924	2.43	0.370	0.000	0.126	0.000	0.122	0.010	1.895

	Unnamed: 0	Happiness Rank	Country	Region	Happiness Score	Economy (GDP per Capita)	Family (Social Support)	Health (Life Expectancy)	Freedom	Trust (Government Corruption)	Generosity	Year
0	0	1	Finland	Western Europe	7,821	1,892	1,258	0,775	0,736	0,534	0,109	2022
1	1	2	Denmark	Western Europe	7,636	1,953	1,243	0,777	0,719	0,532	0,188	2022
2	2	3	Iceland	Western Europe	7,557	1,936	1,320	0,803	0,718	0,191	0,270	2022
3	3	4	Switzerland	Western Europe	7,512	2,026	1,226	0,822	0,677	0,461	0,147	2022
4	4	5	Netherlands	Western Europe	7,415	1,945	1,206	0,787	0,651	0,419	0,271	2022
...	...	...	...	...	...	...	...	...	...	...	...	...
140	141	142	Botswana	Sub-Saharan Africa	3,471	1,503	0,815	0,280	0,571	0,102	0,012	2022
141	142	143	Rwanda	Sub-Saharan Africa	3,268	0,785	0,133	0,462	0,621	0,544	0,187	2022
142	143	144	Zimbabwe	Sub-Saharan Africa	2,995	0,947	0,690	0,270	0,329	0,105	0,106	2022
143	144	145	Lebanon	Middle East and Northern Africa	2,955	1,392	0,498	0,631	0,103	0,034	0,082	2022
144	145	146	Afghanistan	Southern Asia	2,404	0,758	0,000	0,289	0,000	0,005	0,089	2022

	Country	Region	Happiness Score(2022)	Social Support(2022)	Life Expectancy(2022)	Freedom(2022)	Corruption(2022)	GDP per Capita(2022)
0	Finland	Western Europe	7.821	1.258	77.5	0.736	0.534	10.885696
1	Denmark	Western Europe	7.636	1.243	77.7	0.719	0.532	10.931476
2	Iceland	Western Europe	7.557	1.320	80.3	0.718	0.191	10.918863
3	Switzerland	Western Europe	7.512	1.226	82.2	0.677	0.461	10.984418
4	Netherlands	Western Europe	7.415	1.206	78.7	0.651	0.419	10.925554
...	...	...	...	...	...	...	...	...
140	Botswana	Sub-Saharan Africa	3.471	0.815	28.0	0.571	0.102	10.553629
141	Rwanda	Sub-Saharan Africa	3.268	0.133	46.2	0.621	0.544	9.616549
142	Zimbabwe	Sub-Saharan Africa	2.995	0.690	27.0	0.329	0.105	9.887221
143	Lebanon	Middle East and Northern Africa	2.955	0.498	63.1	0.103	0.034	10.442943
144	Afghanistan	Southern Asia	2.404	0.000	28.9	0.000	0.005	9.566054

	Country name	Ladder score	Standard error of ladder score	upperwhisker	lowerwhisker	Logged GDP per capita	Social support	Healthy life expectancy	Freedom to make life choices	Generosity	Perceptions of corruption	Ladder score in Dystopia	Explained by: Log GDP per capita	Explained by: Social support	Explained by: Healthy life expectancy	Explained by: Freedom to make life choices	Explained by: Generosity	Explained by: Perceptions of corruption	Dystopia + residual
0	Finland	7.804	0.036	7.875	7.733	10.792	0.969	71.150	0.961	-0.019	0.182	1.778	1.888	1.585	0.535	0.772	0.126	0.535	2.363
1	Denmark	7.586	0.041	7.667	7.506	10.962	0.954	71.250	0.934	0.134	0.196	1.778	1.949	1.548	0.537	0.734	0.208	0.525	2.084
2	Iceland	7.530	0.049	7.625	7.434	10.896	0.983	72.050	0.936	0.211	0.668	1.778	1.926	1.620	0.559	0.738	0.250	0.187	2.250
3	Israel	7.473	0.032	7.535	7.411	10.639	0.943	72.697	0.809	-0.023	0.708	1.778	1.833	1.521	0.577	0.569	0.124	0.158	2.691
4	Netherlands	7.403	0.029	7.460	7.346	10.942	0.930	71.550	0.887	0.213	0.379	1.778	1.942	1.488	0.545	0.672	0.251	0.394	2.110
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
132	Congo (Kinshasa)	3.207	0.095	3.394	3.020	7.007	0.652	55.375	0.664	0.086	0.834	1.778	0.531	0.784	0.105	0.375	0.183	0.068	1.162
133	Zimbabwe	3.204	0.061	3.323	3.084	7.641	0.690	54.050	0.654	-0.046	0.766	1.778	0.758	0.881	0.069	0.363	0.112	0.117	0.905
134	Sierra Leone	3.138	0.082	3.299	2.976	7.394	0.555	54.900	0.660	0.105	0.858	1.778	0.670	0.540	0.092	0.371	0.193	0.051	1.221
135	Lebanon	2.392	0.044	2.479	2.305	9.478	0.530	66.149	0.474	-0.141	0.891	1.778	1.417	0.476	0.398	0.123	0.061	0.027	-0.110
136	Afghanistan	1.859	0.033	1.923	1.795	7.324	0.341	54.712	0.382	-0.081	0.847	1.778	0.645	0.000	0.087	0.000	0.093	0.059	0.976

	Canada	Germany	India	Singapore	United States
count	4.000000	4.000000	4.000000	4.000000	4.000000
mean	2178.000000	3780.000000	6444.250000	1758.750000	71327.000000
std	1942.307562	5761.614068	4305.587407	1495.189258	57459.639412
min	45.000000	87.000000	200.000000	21.000000	6150.000000
25%	867.000000	270.000000	5288.000000	882.000000	35349.000000
50%	2163.000000	1377.500000	8052.500000	1765.000000	71129.000000
75%	3474.000000	4887.500000	9208.750000	2641.750000	107107.000000
max	4341.000000	12278.000000	9472.000000	3484.000000	136900.000000

	Country	Mean	Median	Mode	Range	Variance	Standard Deviation
0	Canada	2178.00	2163.0	45.0	4296.0	2.829419e+06	1682.087691
1	Germany	3780.00	1377.5	87.0	12191.0	2.489715e+07	4989.704150
2	India	6444.25	8052.5	200.0	9272.0	1.390356e+07	3728.748072
3	Singapore	1758.75	1765.0	21.0	3463.0	1.676693e+06	1294.871881
4	United States	71327.00	71129.0	6150.0	130750.0	2.476208e+09	49761.507423

	Happiness Score(2020)	Happiness Score(2021)	Happiness Score(2022)	Happiness Score(2023)	GDP per Capita(2020)	Annual GDP Growth(2020)	GDP per Capita(2021)	Annual GDP Growth(2021)	GDP per Capita(2022)	Annual GDP Growth(2022)	...	Life Expectancy(2022)	Life Expectancy(2023)	Freedom(2020)	Freedom(2021)	Freedom(2022)	Freedom(2023)	Corruption(2020)	Corruption(2021)	Corruption(2022)	Corruption(2023)
count	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	...	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000	78.000000
mean	5.759954	5.785679	5.772333	5.717577	9.354079	-4.057692	9.474013	6.702564	10.393149	4.282051	...	58.810256	64.897090	0.796752	0.800923	0.525218	0.792731	0.735960	0.731962	0.151410	0.721436
std	1.083606	1.063354	1.070501	1.117021	1.238501	4.681034	1.240384	3.347958	0.523038	2.149101	...	18.805357	6.295909	0.104412	0.101338	0.131739	0.099584	0.184427	0.186852	0.132619	0.191910
min	3.478900	3.467000	3.471000	3.138000	6.842167	-17.700000	6.958000	-1.200000	9.154818	-0.500000	...	19.100000	51.530000	0.541345	0.548000	0.180000	0.470000	0.168489	0.179000	0.000000	0.182000
25%	4.842650	4.963000	4.969250	4.879750	8.263101	-7.600000	8.535250	4.600000	10.082431	2.825000	...	40.300000	58.565000	0.732581	0.735500	0.448000	0.731250	0.714415	0.685000	0.066750	0.657750
50%	5.937600	5.950500	6.019000	5.995500	9.672709	-3.300000	9.784500	6.250000	10.570235	4.050000	...	65.050000	66.825500	0.817455	0.806500	0.545500	0.801500	0.801174	0.802000	0.104500	0.791000
75%	6.396025	6.434000	6.464000	6.450250	10.342724	-1.200000	10.494500	7.750000	10.814778	5.175000	...	76.050000	70.749250	0.875933	0.878250	0.620000	0.856000	0.850790	0.847750	0.188000	0.845500
max	7.808700	7.842000	7.821000	7.804000	11.450681	6.600000	11.647000	15.800000	11.109178	11.900000	...	82.200000	72.900000	0.955750	0.960000	0.736000	0.961000	0.935585	0.939000	0.534000	0.929000

	Mean	Median	Mode	Range	Variance	Standard Deviation
2020	5.759954	5.9376	3.4789	4.3298	1.174201	1.083606
2021	5.785679	5.9505	5.9290	4.3750	1.130722	1.063354
2022	5.772333	6.0190	5.0480	4.3500	1.145973	1.070501
2023	5.717577	5.9955	6.1250	4.6660	1.247735	1.117021

	Mean Happiness Score	Median Happiness Score	Mode Happiness Score	Range Happiness Score	Variance Happiness Score	Standard Deviation Happiness Score	Mean GDP per Capita	Median GDP per Capita	Mode GDP per Capita	Range GDP per Capita	...	Mode Freedom	Range Freedom	Variance Freedom	Standard Deviation Freedom	Mean Corruption	Median Corruption	Mode Corruption	Range Corruption	Variance Corruption	Standard Deviation Corruption
Region
Central and Eastern Europe	5.833829	5.9752	4.8827	1.4807	0.201006	0.448337	9.951746	10.089230	9.204430	1.187893	...	0.649566	0.286577	0.008346	0.091359	0.842183	0.868213	0.623074	0.312511	0.009424	0.097078
Latin America and Caribbean	6.170829	6.1634	5.6892	1.4322	0.127295	0.356785	9.403721	9.500369	8.424535	1.609699	...	0.744896	0.189843	0.001882	0.043388	0.798553	0.801215	0.635994	0.257721	0.004887	0.069905
Sub-Saharan Africa	4.568541	4.5830	3.4789	2.6224	0.305777	0.552972	8.014065	7.750990	6.842167	3.114619	...	0.547613	0.341922	0.008133	0.090185	0.798582	0.805343	0.683019	0.178855	0.002599	0.050977
Western Europe	6.967405	7.1291	5.5150	2.2937	0.388571	0.623354	10.702537	10.703229	10.132326	1.318355	...	0.541345	0.414405	0.011419	0.106861	0.523861	0.478188	0.168489	0.724140	0.058978	0.242854

	Mean	Median	Mode	Range	Variance	Standard Deviation
2020	-4.057692	-3.30	-1.8	24.3	21.912083	4.681034
2021	6.702564	6.25	4.6	17.0	11.208825	3.347958
2022	4.282051	4.05	4.0	12.4	4.618635	2.149101
2023	2.516667	2.60	2.0	9.5	3.827381	1.956369

	Mean	Median	Mode	Range	Variance	Standard Deviation
2020	0.825443	0.856406	0.468671	0.505998	0.013016	0.114088
2021	0.825936	0.859000	0.934000	0.494000	0.012838	0.113305
2022	0.929449	1.026000	0.865000	1.256000	0.078248	0.279728
2023	0.809679	0.850500	0.882000	0.546000	0.016568	0.128718

	Mean	Median	Mode	Range	Variance	Standard Deviation
2020	65.007615	67.400627	48.220539	26.182171	53.930014	7.343706
2021	65.271449	67.781500	72.600000	26.222000	52.877663	7.271703
2022	58.810256	65.050000	65.900000	63.100000	353.641452	18.805357
2023	64.897090	66.825500	71.150000	21.370000	39.638469	6.295909

Dep. Variable:	Happiness Score(2020)	R-squared:	0.839
Model:	OLS	Adj. R-squared:	0.825
Method:	Least Squares	F-statistic:	61.45
Date:	Wed, 17 Apr 2024	Prob (F-statistic):	3.75e-26
Time:	11:56:18	Log-Likelihood:	-45.323
No. Observations:	78	AIC:	104.6
Df Residuals:	71	BIC:	121.1
Df Model:	6
Covariance Type:	nonrobust

	coef	std err	t	P>\|t\|	[0.025	0.975]
const	-1.1665	0.852	-1.370	0.175	-2.864	0.531
GDP per Capita(2020)	0.0555	0.109	0.509	0.613	-0.162	0.273
Annual GDP Growth(2020)	0.0115	0.014	0.811	0.420	-0.017	0.040
Social Support(2020)	1.8707	0.983	1.904	0.061	-0.089	3.830
Life Expectancy(2020)	0.0697	0.016	4.385	0.000	0.038	0.101
Freedom(2020)	1.7165	0.645	2.662	0.010	0.431	3.002
Corruption(2020)	-1.3460	0.375	-3.591	0.001	-2.093	-0.599

Omnibus:	18.536	Durbin-Watson:	1.344
Prob(Omnibus):	0.000	Jarque-Bera (JB):	30.864
Skew:	-0.901	Prob(JB):	1.99e-07
Kurtosis:	5.499	Cond. No.	1.33e+03

Forecasting 2024 Global Happiness Trends: Exploring the Impact of Cumulative Socio-Economic Influences¶

Overview¶

Motivation¶

Data Sources¶

Project Part I - Data Description & Manipulation¶

Data Description¶

Global GDP Growth Rate 2020-2023¶

Global Layoffs 2020-2023¶

World Income Groups 2022¶

World Happiness Report (2020-2022) & World Happiness Report 2023¶

Global Layoffs Dataset¶

World Income Groups Dataset¶

Global GDP Growth Rate Dataset¶

World Happiness Reports¶

Happiness Dataset - 2020¶

Happiness Dataset - 2021¶

Happiness Dataset - 2022¶

Happiness Dataset - 2023¶

Data Merging¶

Project Part II - Analysis¶

Descriptive Statistics¶

Inferential Statistics¶

Hypotheses¶

Correlation Analysis - 2020¶

Correlation Analysis - 2021¶

Correlation Analysis - 2022¶

Correlation Analysis - 2023¶

ANOVA Analysis - Layoffs and Other Variables (2020-2023)¶

Graphical Analysis¶

Joint Plots (2020-2023)¶

Correlation between Annual GDP Growth and Happiness Scores¶

Correlation between Life Expectancy and Happiness Scores¶

Outlier Boxplots (2020-2023)¶

Residual Plots (2020-2023)¶

Heatmaps (2020-2023)¶

Comparative Analysis¶

Scatter Plots (2020-2023)¶

Bar Plots (2020-2023)¶

Line Plots (2020-2023)¶

Multivariate Analysis¶

Cross-Tabulation (2020-2023)¶

Pivot Table (2020-2023)¶

Pair Plot (2020-2023)¶

Synthesis¶

Descriptive Analysis¶

Inferential Analysis¶

Graphical Analysis¶

Comparative Analysis¶

Multivariate Analysis¶

Insights from Combined Datasets¶

Summary¶

Reflection¶

Areas for Further Research and Improvement:¶

Project Part III - Machine Learning¶

Preprocessing Data¶

Feature Scaling¶

Label Encoding¶

Dimensionality Reduction¶

Data Preprocessing - Interpretation & Analysis¶

Feature Scaling and Normalization:¶

Label Encoding:¶

Dimensionality Reduction using PCA:¶

Concatenation of PCA Components:¶

Classification¶

Model Justification¶

Model Training¶

Model Tuning¶

Model Testing¶

Model Ensembling¶

Comparison Methods¶

Classification Results - Interpretation & Analysis¶

Insights and Findings: Forecasting 2024 Global Happiness Trends Based on Socio-Economic Factors¶

Omnibus:	21.389	Durbin-Watson:	1.200
Prob(Omnibus):	0.000	Jarque-Bera (JB):	35.939
Skew:	-1.044	Prob(JB):	1.57e-08
Kurtosis:	5.588	Cond. No.	1.41e+03

Omnibus:	9.419	Durbin-Watson:	1.560
Prob(Omnibus):	0.009	Jarque-Bera (JB):	9.366
Skew:	-0.701	Prob(JB):	0.00925
Kurtosis:	3.956	Cond. No.	2.57e+03

Omnibus:	12.819	Durbin-Watson:	1.624
Prob(Omnibus):	0.002	Jarque-Bera (JB):	13.600
Skew:	-0.912	Prob(JB):	0.00111
Kurtosis:	3.925	Cond. No.	1.43e+03

	2020				2021				2022				2023
Income Group	High income	Low income	Lower middle income	Upper middle income	High income	Low income	Lower middle income	Upper middle income	High income	Low income	Lower middle income	Upper middle income	High income	Low income	Lower middle income	Upper middle income
Region
Central and Eastern Europe	6.045725	0.0000	0.00000	5.551300	6.139625	0.000000	0.0000	5.704500	6.301000	0.000000	0.0000	5.753000	6.387000	0.000000	0.0000	5.768333
Latin America and Caribbean	6.324467	0.0000	6.04515	6.153367	6.261000	0.000000	5.9455	6.105333	6.318333	0.000000	6.0935	5.949167	6.364333	0.000000	6.1410	5.923750
Sub-Saharan Africa	0.000000	4.3706	4.71087	4.758940	0.000000	4.423333	4.7346	4.779600	0.000000	4.432667	4.6097	4.830600	0.000000	4.170167	4.5598	4.855600
Western Europe	6.967405	0.0000	0.00000	0.000000	6.983850	0.000000	0.0000	0.000000	6.967100	0.000000	0.0000	0.000000	6.894350	0.000000	0.0000	0.000000

	PC1	PC2	PC3
0	-6.191251	4.661359	0.809122
1	-6.174859	4.313292	0.317576
2	-5.855300	3.245911	0.937272
3	-4.710899	-0.534594	-1.640048
4	-5.971365	3.764458	0.094044
...	...	...	...
73	6.048755	-0.203244	2.353472
74	6.518907	0.861645	-0.027387
75	3.944231	0.898453	-1.235837
76	5.445202	2.753728	-0.289831
77	1.421981	-0.687209	-1.504214

	0
0	7.257876
1	7.287772
2	7.122208
3	7.257570
4	7.202756
...	...
73	4.281264
74	3.746645
75	4.201892
76	3.915991
77	4.031476

	Country	Happiness Score(2024)
0	Finland	7.741
1	Denmark	7.583
2	Iceland	7.525
3	Sweden	7.344
4	Israel	7.341
...	...	...
138	Congo (Kinshasa)	3.295
139	Sierra Leone	3.245
140	Lesotho	3.186
141	Lebanon	2.707
142	Afghanistan	1.721