# import the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
from networkx.algorithms import bipartite

# Load the three Steam tables from the local data/ directory
games = pd.read_csv('data/games.csv')
users = pd.read_csv('data/users.csv')
recommendations = pd.read_csv('data/recommendations.csv')

# Peek at five random rows of the games table
games.sample(5)

# Headline statistics for the games table: row count plus the medians of
# the review-count and positive-ratio columns
number_of_games = len(games)
median_user_review, median_positive_ratio = (
    games[column].median() for column in ('user_reviews', 'positive_ratio')
)

print(f'The dataset contains {number_of_games} games')
print(f'The median number of user review is {median_user_review}')
print(f'The median positive ratio is {median_positive_ratio}')

The dataset contains 50872 games
The median number of user review is 49.0
The median positive ratio is 81.0

# Count plot of the `rating` column: one bar per rating label, each bar
# annotated with its count, saved to rating_distribution.png

plt.figure(figsize=(18, 9))

ax = sns.countplot(data=games, x='rating', color='skyblue')

# Write every bar's count, horizontally centred, just above the bar top
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center_x = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{int(bar_height)}',
        (bar_center_x, bar_height),
        ha='center',
        va='bottom'
    )

axis_label_style = {'fontsize': 14, 'fontweight': 'bold'}
plt.title('Distribution of Rating Types for Games', fontsize=20, fontweight='bold')
plt.xlabel('Rating Type', **axis_label_style)
plt.ylabel('Count', **axis_label_style)
sns.despine()

# The figure is written to disk first and displayed afterwards
plt.savefig('rating_distribution.png')

plt.show()

# Histogram (20 bins, with a KDE overlay) of the games' positive_ratio
# column, saved to positive_ratio_distribution.png
plt.figure(figsize=(10, 6))
sns.histplot(games['positive_ratio'], bins=20, kde=True, color='steelblue')

axis_label_style = {'fontsize': 12, 'fontweight': 'bold'}
plt.title('Distribution of Positive Ratio for Game Ratings', fontsize=16, fontweight='bold')
plt.xlabel('Positive Ratio(%)', **axis_label_style)
plt.ylabel('Frequency', **axis_label_style)
sns.despine()

# The figure is written to disk first and displayed afterwards
plt.savefig('positive_ratio_distribution.png')

plt.show()

# Turn user ids into strings of the form "User_<id>" in both tables.
# String ids keep users distinguishable from the app ids when both are
# later used as nodes of the same graph. The vectorised concatenation
# produces the same strings as a per-row f-string without calling a
# Python lambda for every row.
recommendations['user_id'] = 'User_' + recommendations['user_id'].astype(str)
recommendations.sample(5)

users['user_id'] = 'User_' + users['user_id'].astype(str)
users.sample(5)

# Attach each reviewer's profile columns to their reviews (inner join on
# user_id, the default for pd.merge)
user_reviews = pd.merge(recommendations, users, on='user_id')

# Keep only the positive reviews. The element-wise `== True` comparison is
# deliberate: it yields a plain boolean mask even if the column contains
# missing values.
user_reviews = user_reviews[user_reviews['is_recommended'] == True]

user_reviews.sample(5)

# Work on a reproducible sample of at most 10,000 positive reviews; the
# cap avoids a ValueError when fewer rows are available
sample_size = min(10000, len(user_reviews))
sample_user_reviews = user_reviews.sample(sample_size, random_state=1)

# Build the user-game graph from the sampled positive reviews: app ids
# form node set 0, user ids form node set 1, and each review contributes
# one edge between its game and its author
app_ids = sample_user_reviews['app_id']
user_ids = sample_user_reviews['user_id']

G = nx.Graph()
G.add_nodes_from(app_ids, bipartite=0)   # game side
G.add_nodes_from(user_ids, bipartite=1)  # user side
G.add_edges_from(zip(app_ids, user_ids))

# Lookup table app_id -> title, used when printing results
title_by_app_id = games.set_index('app_id')['title']
games_titles = title_by_app_id.to_dict()

# Sanity check: the graph must be two-colourable
nx.is_bipartite(G)

True

# Split the nodes by their `bipartite` attribute: 0 marks the game side,
# everything else is the user side
game_nodes = {node for node, attrs in G.nodes(data=True) if attrs['bipartite'] == 0}
user_nodes = set(G).difference(game_nodes)

# Size of the graph and mean node degree
node_count = G.number_of_nodes()
total_degree = sum(degree for _, degree in G.degree())
print("Number of nodes:", node_count)
print("Number of edges:", G.number_of_edges())
print("Average degree:", total_degree / G.number_of_nodes())

Number of nodes: 12845
Number of edges: 10000
Average degree: 1.5570260801868432

# Projected graphs
# Game-game projection: two games are linked when they share at least one
# user neighbour in G
game_projection = bipartite.weighted_projected_graph(G, game_nodes)
print("Game-Game Projection - Number of nodes:", game_projection.number_of_nodes())
print("Game-Game Projection - Number of edges:", game_projection.number_of_edges())

# User-user projection: two users are linked when they share at least one
# game neighbour in G
user_projection = bipartite.weighted_projected_graph(G, user_nodes)
print("User-User Projection - Number of nodes:", user_projection.number_of_nodes())
print("User-User Projection - Number of edges:", user_projection.number_of_edges())

# Centrality analysis (on original graph)
# bipartite.degree_centrality returns ONE dict that covers both node sets,
# whichever set is passed in. Compute it once and split it by node set so
# the user ranking no longer lists games.
degree_centrality = bipartite.degree_centrality(G, game_nodes)
game_centrality = {node: value for node, value in degree_centrality.items() if node in game_nodes}
user_centrality = {node: value for node, value in degree_centrality.items() if node in user_nodes}
print("Top 5 Games by Degree Centrality:", sorted(game_centrality.items(), key=lambda x: x[1], reverse=True)[:5])
print("Top 5 Users by Degree Centrality:", sorted(user_centrality.items(), key=lambda x: x[1], reverse=True)[:5])

# Clustering coefficient of every node within its own projection
game_clustering = nx.clustering(game_projection)
user_clustering = nx.clustering(user_projection)
print("Top 5 Games by Clustering Coefficient:", sorted(game_clustering.items(), key=lambda x: x[1], reverse=True)[:5])
print("Top 5 Users by Clustering Coefficient:", sorted(user_clustering.items(), key=lambda x: x[1], reverse=True)[:5])

Game-Game Projection - Number of nodes: 2867
Game-Game Projection - Number of edges: 22
User-User Projection - Number of nodes: 9978
User-User Projection - Number of edges: 69013
Top 5 Games by Degree Centrality: [(440, 0.009921828021647625), (252490, 0.0065143315293646024), (570, 0.005913008618961716), (1091500, 0.005712567648827421), (730, 0.005512126678693125)]
Top 5 Users by Degree Centrality: [(440, 0.009921828021647625), (252490, 0.0065143315293646024), (570, 0.005913008618961716), (1091500, 0.005712567648827421), (730, 0.005512126678693125)]
Top 5 Games by Clustering Coefficient: [(688130, 0), (65540, 0), (262150, 0), (868360, 0), (40970, 0)]
Top 5 Users by Clustering Coefficient: [('User_8649005', 1.0), ('User_11851626', 1.0), ('User_6946150', 1.0), ('User_13815829', 1.0), ('User_1372818', 1.0)]

# Show the five highest-ranked entries of game_centrality by title,
# with the centrality rounded to four decimals
ranked_by_centrality = sorted(game_centrality, key=game_centrality.get, reverse=True)
top_5_games = [(game, game_centrality[game]) for game in ranked_by_centrality[:5]]
for game, centrality in top_5_games:
    print(f'{games_titles[game]}: {centrality:.4f}')

Team Fortress 2: 0.0099
Rust: 0.0065
Dota 2: 0.0059
Cyberpunk 2077: 0.0057
Counter-Strike: Global Offensive: 0.0055

# Show the five highest-ranked entries of game_clustering by title,
# with the clustering coefficient rounded to four decimals
ranked_by_clustering = sorted(game_clustering, key=game_clustering.get, reverse=True)
top_5_games = [(game, game_clustering[game]) for game in ranked_by_clustering[:5]]
for game, coefficient in top_5_games:
    print(f'{games_titles[game]}: {coefficient:.4f}')

Pogostuck: Rage With Your Friends: 0.0000
Gothic 1: 0.0000
Vanguard Princess: 0.0000
Project Hospital: 0.0000
Stronghold Crusader HD: 0.0000

# Community detection
from networkx.algorithms import community

# Louvain is a randomised algorithm: without a seed the partition (and so
# the numbers printed below) can change from run to run. The seed is fixed
# here like the random_state values used elsewhere in this analysis.
communities = community.louvain_communities(G, seed=42)
print("Number of detected communities:", len(communities))

# The largest community is the node set with the most members
largest_community = max(communities, key=len)
print("Number of nodes in largest community:", len(largest_community))

# Communities mix users and games; list only the game members, by title
print("Games in largest community:")
for node in largest_community:
    if node in game_nodes:
        print(games_titles[node])

Number of detected communities: 2845
Number of nodes in largest community: 100
Games in largest community:
Team Fortress 2

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import random
# Prepare data for link prediction on the user-user projection.
# Positive examples are the existing edges; negative examples are drawn
# from the pairs of users that are not connected.
# NOTE(review): nx.non_edges enumerates every unconnected pair, so this
# list grows quadratically with the number of users - consider sampling
# negatives instead of materialising all of them.
edges = [*user_projection.edges()]
non_edges = [*nx.non_edges(user_projection)]

# Same 80/20 split and seed for positives and negatives
split_options = {'test_size': 0.2, 'random_state': 42}
train_edges, test_edges = train_test_split(edges, **split_options)
train_non_edges, test_non_edges = train_test_split(non_edges, **split_options)

# Feature Extraction
def generate_features(graph, edges, label):
    """
    Build one labelled feature row per node pair.

    Parameters:
        graph (networkx.Graph): Graph the similarity scores are computed on.
        edges (iterable): Pairs (u, v) of nodes to score.
        label: Value stored as the last element of every row.

    Returns:
        List of rows [jaccard, common_neighbors, preferential_attachment, label],
        in the same order as `edges`.
    """
    def _feature_row(u, v):
        pair = [(u, v)]

        # Each scoring helper yields (u, v, score) triples; only one pair
        # is passed, so the first triple's score is the one wanted
        jaccard = next(iter(nx.jaccard_coefficient(graph, pair)))[2]

        # Number of nodes adjacent to both u and v
        shared_neighbors = sum(1 for _ in nx.common_neighbors(graph, u, v))

        attachment = next(iter(nx.preferential_attachment(graph, pair)))[2]

        return [jaccard, shared_neighbors, attachment, label]

    return [_feature_row(u, v) for u, v in edges]

# Generate training and testing datasets
# Negatives are down-sampled to match the number of positives. A dedicated
# seeded generator makes that sample - and therefore the reported scores -
# reproducible, in line with the random_state=42 used for the splits and
# the classifier.
# NOTE(review): every feature is computed on user_projection, which still
# contains the test edges; the metrics below are therefore likely
# optimistic. Consider removing test_edges from the graph before
# extracting features.
negative_sampler = random.Random(42)
train_positive = generate_features(user_projection, train_edges, 1)
train_negative = generate_features(
    user_projection, negative_sampler.sample(train_non_edges, len(train_edges)), 0
)
test_positive = generate_features(user_projection, test_edges, 1)
test_negative = generate_features(
    user_projection, negative_sampler.sample(test_non_edges, len(test_edges)), 0
)

# Combine datasets: positives first, then negatives, in both frames
columns = ['jaccard', 'common_neighbors', 'preferential_attachment', 'label']
train_data = pd.DataFrame(train_positive + train_negative, columns=columns)
test_data = pd.DataFrame(test_positive + test_negative, columns=columns)

# Train Link Prediction Model
X_train = train_data.drop(columns='label')
y_train = train_data['label']
X_test = test_data.drop(columns='label')
y_test = test_data['label']

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate: hard labels for accuracy, class-1 probabilities
# for ROC AUC
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC Score: {roc_auc}")

Accuracy: 0.997174527276679
ROC AUC Score: 0.9999768689112704

# Keep the held-out user pairs the model labels as links. The test frame
# lists the positive rows (built from test_edges, in order) before the
# negatives, so zipping with test_edges pairs each edge with its own
# prediction and stops before the negative rows.
predicted_links = [
    (first_user, second_user)
    for (first_user, second_user), predicted_label in zip(test_edges, y_pred)
    if predicted_label == 1
]

def recommend_top_games_by_hours(graph, user, predicted_user, game_hours, top_n=1):
    """
    Pick games for `user` from the neighbours of `predicted_user`.

    Candidates are the graph neighbours of `predicted_user` that are not
    neighbours of `user`; they are ordered by their `game_hours` value,
    highest first, and the first `top_n` are returned.

    Parameters:
        graph (networkx.Graph): The original bipartite graph.
        user (str): The user to recommend games for.
        predicted_user (str): The user connected via a predicted link.
        game_hours: Object with a dict-style ``get(key, default)`` mapping
            game IDs to hours played; missing games count as 0 hours.
        top_n (int): Number of top games to recommend.

    Returns:
        List of at most `top_n` game IDs (empty when there is no candidate).
    """
    already_played = set(graph.neighbors(user))
    peer_games = set(graph.neighbors(predicted_user))

    def hours_for(game):
        # Games without an entry sort as if they had zero hours
        return game_hours.get(game, 0)

    candidates = list(peer_games - already_played)
    candidates.sort(key=hours_for, reverse=True)

    return candidates[:top_n]

# Get the total hours played for each game.
# The hours must be aggregated per app_id: indexing the raw review rows by
# app_id leaves duplicate labels, for which Series.get returns a whole
# Series instead of a number and breaks the ranking comparison.
game_hours = sample_user_reviews.groupby('app_id')['hours'].sum()

# Generate recommendations for each predicted link
top_n = 1
# NOTE(review): this rebinds `recommendations`, which earlier held the
# recommendations DataFrame; re-running earlier cells after this one needs
# the CSV to be reloaded.
recommendations = {}

for u, v in predicted_links:
    # Recommend for user u, based on what v is connected to
    rec_u = recommend_top_games_by_hours(G, u, v, game_hours, top_n)
    if rec_u:
        recommendations[u] = rec_u

    # Recommend for user v, based on what u is connected to
    rec_v = recommend_top_games_by_hours(G, v, u, game_hours, top_n)
    if rec_v:
        recommendations[v] = rec_v

# Display recommendations with game titles
for user, recs in recommendations.items():
    print(f"Recommendations for {user}:")
    for game in recs:
        print(games_titles[game])
    print()

Recommendations for User_11528755:
Rebuild 3: Gangs of Deadsville

Recommendations for User_3169283:
Left 4 Dead 2

Recommendations for User_8216702:
The Forest

Recommendations for User_11314769:
Hurtworld

Recommendations for User_2515142:
Detroit: Become Human

Recommendations for User_3478715:
Warframe

Recommendations for User_12360999:
Hurtworld

Recommendations for User_2627399:
VRChat

Recommendations for User_8100846:
Rebuild 3: Gangs of Deadsville

Recommendations for User_4839232:
Oh...Sir!! The Insult Simulator

Recommendations for User_7278557:
VRChat

Recommendations for User_4428884:
Slime Rancher

Recommendations for User_4384432:
Dead Rising 3 Apocalypse Edition

Recommendations for User_1490364:
Left 4 Dead 2

Recommendations for User_3159603:
DARK SOULS™ II

Recommendations for User_6154000:
Oh...Sir!! The Insult Simulator

Recommendations for User_8751113:
Oh...Sir!! The Insult Simulator

Recommendations for User_12897038:
Knight Online

Recommendations for User_6845471:
Slime Rancher

Recommendations for User_1330371:
Detroit: Become Human

Recommendations for User_3215691:
Tomb Raider: Underworld

Recommendations for User_4354181:
KINGDOMS

Recommendations for User_9015003:
VRChat

Recommendations for User_3956653:
Detroit: Become Human

Recommendations for User_7220345:
Wizard And Minion Idle

Recommendations for User_14276710:
The Forest

Recommendations for User_1087238:
Oh...Sir!! The Insult Simulator

Recommendations for User_12988151:
Slime Rancher

Recommendations for User_8977050:
Detroit: Become Human

Recommendations for User_4041624:
Detroit: Become Human

Recommendations for User_2054864:
Detroit: Become Human

Recommendations for User_2479280:
Detroit: Become Human

Recommendations for User_5436587:
VRChat

Recommendations for User_14091926:
Oh...Sir!! The Insult Simulator

Recommendations for User_10801194:
Slime Rancher

Recommendations for User_4320885:
Dead Rising 3 Apocalypse Edition

Recommendations for User_11643239:
Oh...Sir!! The Insult Simulator

Recommendations for User_4647956:
VRChat

Recommendations for User_8056987:
The Forest

Recommendations for User_10291003:
The Forest

Recommendations for User_821258:
VRChat

Recommendations for User_6198361:
The Forest

Recommendations for User_12660619:
Slime Rancher

Recommendations for User_11794773:
Hyperdimension Neptunia Re;Birth1

Recommendations for User_5001327:
Slime Rancher

Recommendations for User_8266521:
VRChat

Recommendations for User_4561435:
Detroit: Become Human

Recommendations for User_7314572:
KINGDOMS

Recommendations for User_13849442:
Detroit: Become Human

Recommendations for User_3336928:
Marvel’s Spider-Man Remastered

Recommendations for User_9008925:
VRChat

Recommendations for User_6212642:
Wizard And Minion Idle

Recommendations for User_14253764:
Slime Rancher

Recommendations for User_2274600:
Slime Rancher

Recommendations for User_8728442:
The Forest

Recommendations for User_444465:
Detroit: Become Human

Recommendations for User_5524561:
Hyperdimension Neptunia Re;Birth1

Recommendations for User_3797421:
Siege Survival: Gloria Victis

Recommendations for User_4855471:
Slime Rancher

	app_id	title	date_release	win	mac	linux	rating	positive_ratio	user_reviews	price_final	price_original	steam_deck
25957	790290	Lonely in the Winter	2018-02-15	True	False	False	Positive	91	36	2.99	2.99	True
1029	667600	Sky Force Reloaded	2017-11-29	True	True	True	Very Positive	93	1819	9.99	9.99	True
38784	869760	Stoneshard: Prologue	2018-06-05	True	False	True	Very Positive	87	3508	0.00	0.00	True
44046	976410	Infinite Children	2019-05-06	True	True	False	Mixed	63	135	0.00	0.00	True
12285	504390	Along the Edge	2016-10-12	True	True	True	Very Positive	90	294	13.99	13.99	True

	app_id	helpful	date	is_recommended	hours	user_id	review_id
5818036	271590	3	2020-07-13	False	75.0	User_9834235	5818036
12626472	292030	0	2022-06-30	True	54.3	User_9306483	12626472
21575847	518790	0	2020-03-26	True	15.1	User_9992984	21575847
26165166	431960	0	2021-01-15	True	16.3	User_1482254	26165166
13034357	292030	0	2022-12-27	True	47.2	User_14153991	13034357

	user_id	products	reviews
6155976	User_10793219	394	1
5850670	User_13145268	63	7
2254244	User_7952159	36	1
11408946	User_8050062	191	4
9805978	User_11181558	18	3

	app_id	helpful	funny	date	is_recommended	hours	user_id	review_id	products	reviews
33998452	1938090	44	3	2022-10-27	True	507.4	User_7375375	33998452	118	9
39513748	283290	0	0	2020-10-07	True	6.0	User_11181881	39513748	947	124
31436022	542050	0	0	2020-06-21	True	42.3	User_4166972	31436022	2141	23
21557736	1248130	0	0	2022-06-14	True	206.6	User_10266351	21557736	27	2
34194651	747660	0	0	2022-03-19	True	45.1	User_3948669	34194651	9	1

Game Recommendation System - Exploring the Games & Users Network using Steam Data

1. Data Exploration

2. Create the Bipartite Graph between the Games and Users

3. Graph Exploration

4. Link Prediction