Game Recommendation System - Exploring the Games&Users Network using Steam Data¶
In [73]:
# import the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
from networkx.algorithms import bipartite
1. Data Exploration¶
In [74]:
# import the datasets
games = pd.read_csv('data/games.csv')
recommendations = pd.read_csv('data/recommendations.csv')
users = pd.read_csv('data/users.csv')
In [75]:
games.sample(5)
Out[75]:
app_id | title | date_release | win | mac | linux | rating | positive_ratio | user_reviews | price_final | price_original | discount | steam_deck | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
25957 | 790290 | Lonely in the Winter | 2018-02-15 | True | False | False | Positive | 91 | 36 | 2.99 | 2.99 | 0.0 | True |
1029 | 667600 | Sky Force Reloaded | 2017-11-29 | True | True | True | Very Positive | 93 | 1819 | 9.99 | 9.99 | 0.0 | True |
38784 | 869760 | Stoneshard: Prologue | 2018-06-05 | True | False | True | Very Positive | 87 | 3508 | 0.00 | 0.00 | 0.0 | True |
44046 | 976410 | Infinite Children | 2019-05-06 | True | True | False | Mixed | 63 | 135 | 0.00 | 0.00 | 0.0 | True |
12285 | 504390 | Along the Edge | 2016-10-12 | True | True | True | Very Positive | 90 | 294 | 13.99 | 13.99 | 0.0 | True |
In [76]:
# description of the games dataset
number_of_games = games.shape[0]
median_user_review = games['user_reviews'].median()
median_positive_ratio = games['positive_ratio'].median()
print(f'The dataset contains {number_of_games} games')
print(f'The median number of user review is {median_user_review}')
print(f'The median positive ratio is {median_positive_ratio}')
The dataset contains 50872 games The median number of user review is 49.0 The median positive ratio is 81.0
In [77]:
# plot the distribution of ratings
plt.figure(figsize=(18, 9))
# Create the count plot
ax = sns.countplot(data=games, x='rating', color='skyblue')
# Annotate each bar with the height (count) value
for p in ax.patches:
ax.annotate(
f'{int(p.get_height())}', # Convert height to an integer and format it as a string
(p.get_x() + p.get_width() / 2, p.get_height()), # Position the text at the center top of the bar
ha='center', # Center align the text horizontally
va='bottom' # Place the text just above the bar
)
plt.title('Distribution of Rating Types for Games', fontsize=20, fontweight='bold')
plt.xlabel('Rating Type', fontsize=14, fontweight='bold')
plt.ylabel('Count', fontsize=14, fontweight='bold')
sns.despine()
# save the plot
plt.savefig('rating_distribution.png')
plt.show()
In [78]:
# Plot the distribution of positive_ratio
plt.figure(figsize=(10, 6))
sns.histplot(games['positive_ratio'], bins=20, kde=True, color='steelblue')
plt.title('Distribution of Positive Ratio for Game Ratings', fontsize=16, fontweight='bold')
plt.xlabel('Positive Ratio(%)', fontsize=12, fontweight='bold')
plt.ylabel('Frequency', fontsize=12, fontweight='bold')
sns.despine()
# save the plot
plt.savefig('positive_ratio_distribution.png')
plt.show()
2. Create the Bipartite graph between the games and users¶
In [79]:
# convert the user_id to a string in the recommendations dataset
recommendations['user_id'] = recommendations['user_id'].apply(lambda x: f'User_{x}')
recommendations.sample(5)
Out[79]:
app_id | helpful | funny | date | is_recommended | hours | user_id | review_id | |
---|---|---|---|---|---|---|---|---|
5818036 | 271590 | 3 | 0 | 2020-07-13 | False | 75.0 | User_9834235 | 5818036 |
12626472 | 292030 | 0 | 0 | 2022-06-30 | True | 54.3 | User_9306483 | 12626472 |
21575847 | 518790 | 0 | 0 | 2020-03-26 | True | 15.1 | User_9992984 | 21575847 |
26165166 | 431960 | 0 | 0 | 2021-01-15 | True | 16.3 | User_1482254 | 26165166 |
13034357 | 292030 | 0 | 0 | 2022-12-27 | True | 47.2 | User_14153991 | 13034357 |
In [80]:
# convert the user_id to a string in the users dataset
users['user_id'] = users['user_id'].apply(lambda x: f'User_{x}')
users.sample(5)
Out[80]:
user_id | products | reviews | |
---|---|---|---|
6155976 | User_10793219 | 394 | 1 |
5850670 | User_13145268 | 63 | 7 |
2254244 | User_7952159 | 36 | 1 |
11408946 | User_8050062 | 191 | 4 |
9805978 | User_11181558 | 18 | 3 |
In [81]:
# merge the recommendations and users dataframes by the user_id column
user_reviews = pd.merge(recommendations, users, on='user_id')
# delete the rows that is_recommended is False
user_reviews = user_reviews[user_reviews['is_recommended'] == True]
user_reviews.sample(5)
Out[81]:
app_id | helpful | funny | date | is_recommended | hours | user_id | review_id | products | reviews | |
---|---|---|---|---|---|---|---|---|---|---|
33998452 | 1938090 | 44 | 3 | 2022-10-27 | True | 507.4 | User_7375375 | 33998452 | 118 | 9 |
39513748 | 283290 | 0 | 0 | 2020-10-07 | True | 6.0 | User_11181881 | 39513748 | 947 | 124 |
31436022 | 542050 | 0 | 0 | 2020-06-21 | True | 42.3 | User_4166972 | 31436022 | 2141 | 23 |
21557736 | 1248130 | 0 | 0 | 2022-06-14 | True | 206.6 | User_10266351 | 21557736 | 27 | 2 |
34194651 | 747660 | 0 | 0 | 2022-03-19 | True | 45.1 | User_3948669 | 34194651 | 9 | 1 |
In [82]:
# create a small sample of the user_reviews dataset
sample_user_reviews = user_reviews.sample(10000, random_state=1)
In [83]:
# create a bipartite graph between users and games if is_recommended is True
G = nx.Graph()
G.add_nodes_from(sample_user_reviews['app_id'], bipartite=0) # Add all app IDs
G.add_nodes_from(sample_user_reviews['user_id'], bipartite=1) # Add all user IDs
# Add edges between app_id and user_id
G.add_edges_from(zip(sample_user_reviews['app_id'], sample_user_reviews['user_id']))
In [84]:
# store the games' titles as a dictionary
games_titles = games.set_index('app_id')['title'].to_dict()
In [85]:
# check if the graph is bipartite
nx.is_bipartite(G)
Out[85]:
True
3. Graph Exploration¶
In [86]:
# Get the two node sets
game_nodes = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
user_nodes = set(G) - game_nodes
# Basic metrics
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
print("Average degree:", sum(dict(G.degree()).values()) / G.number_of_nodes())
Number of nodes: 12845 Number of edges: 10000 Average degree: 1.5570260801868432
In [87]:
# Projected graphs
# Project onto game nodes (game-game projection)
game_projection = bipartite.weighted_projected_graph(G, game_nodes)
print("Game-Game Projection - Number of nodes:", game_projection.number_of_nodes())
print("Game-Game Projection - Number of edges:", game_projection.number_of_edges())
# Project onto user nodes (user-user projection)
user_projection = bipartite.weighted_projected_graph(G, user_nodes)
print("User-User Projection - Number of nodes:", user_projection.number_of_nodes())
print("User-User Projection - Number of edges:", user_projection.number_of_edges())
# Centrality analysis (on original graph)
game_centrality = bipartite.degree_centrality(G, game_nodes)
user_centrality = bipartite.degree_centrality(G, user_nodes)
print("Top 5 Games by Degree Centrality:", sorted(game_centrality.items(), key=lambda x: x[1], reverse=True)[:5])
print("Top 5 Users by Degree Centrality:", sorted(user_centrality.items(), key=lambda x: x[1], reverse=True)[:5])
# Clustering coefficient
game_clustering = nx.clustering(game_projection)
user_clustering = nx.clustering(user_projection)
print("Top 5 Games by Clustering Coefficient:", sorted(game_clustering.items(), key=lambda x: x[1], reverse=True)[:5])
print("Top 5 Users by Clustering Coefficient:", sorted(user_clustering.items(), key=lambda x: x[1], reverse=True)[:5])
Game-Game Projection - Number of nodes: 2867 Game-Game Projection - Number of edges: 22 User-User Projection - Number of nodes: 9978 User-User Projection - Number of edges: 69013 Top 5 Games by Degree Centrality: [(440, 0.009921828021647625), (252490, 0.0065143315293646024), (570, 0.005913008618961716), (1091500, 0.005712567648827421), (730, 0.005512126678693125)] Top 5 Users by Degree Centrality: [(440, 0.009921828021647625), (252490, 0.0065143315293646024), (570, 0.005913008618961716), (1091500, 0.005712567648827421), (730, 0.005512126678693125)] Top 5 Games by Clustering Coefficient: [(688130, 0), (65540, 0), (262150, 0), (868360, 0), (40970, 0)] Top 5 Users by Clustering Coefficient: [('User_8649005', 1.0), ('User_11851626', 1.0), ('User_6946150', 1.0), ('User_13815829', 1.0), ('User_1372818', 1.0)]
In [88]:
# print the top 5 games titles and their degree centrality
top_5_games = sorted(game_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
for game, centrality in top_5_games:
print(f'{games_titles[game]}: {centrality:.4f}')
Team Fortress 2: 0.0099 Rust: 0.0065 Dota 2: 0.0059 Cyberpunk 2077: 0.0057 Counter-Strike: Global Offensive: 0.0055
In [89]:
# print the top 5 games and their clustering coefficient
top_5_games = sorted(game_clustering.items(), key=lambda x: x[1], reverse=True)[:5]
for game, centrality in top_5_games:
print(f'{games_titles[game]}: {centrality:.4f}')
Pogostuck: Rage With Your Friends: 0.0000 Gothic 1: 0.0000 Vanguard Princess: 0.0000 Project Hospital: 0.0000 Stronghold Crusader HD: 0.0000
In [ ]:
# Community detection
from networkx.algorithms import community
communities = community.louvain_communities(G)
print("Number of detected communities:", len(communities))
# print the features of the largest community
largest_community = max(communities, key=len)
print("Number of nodes in largest community:", len(largest_community))
# print the games in the largest community
print("Games in largest community:")
for node in largest_community:
if node in game_nodes:
print(games_titles[node])
Number of detected communities: 2845 Number of nodes in largest community: 100 Games in largest community: Team Fortress 2
4. Link Prediction¶
In [91]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import random
# Prepare Data for Link Prediction
# Extract existing edges and generate non-edges from the user projection graph
edges = list(user_projection.edges())
non_edges = list(nx.non_edges(user_projection))
# Split into train and test sets
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)
train_non_edges, test_non_edges = train_test_split(non_edges, test_size=0.2, random_state=42)
# Feature Extraction
def generate_features(graph, edges, label):
features = []
for u, v in edges:
# Jaccard similarity
jaccard = list(nx.jaccard_coefficient(graph, [(u, v)]))[0][2]
# Common neighbors
common_neighbors = len(list(nx.common_neighbors(graph, u, v)))
# Preferential attachment
preferential_attachment = list(nx.preferential_attachment(graph, [(u, v)]))[0][2]
features.append([jaccard, common_neighbors, preferential_attachment, label])
return features
# Generate training and testing datasets
train_positive = generate_features(user_projection, train_edges, 1)
train_negative = generate_features(user_projection, random.sample(train_non_edges, len(train_edges)), 0)
test_positive = generate_features(user_projection, test_edges, 1)
test_negative = generate_features(user_projection, random.sample(test_non_edges, len(test_edges)), 0)
# Combine datasets
columns = ['jaccard', 'common_neighbors', 'preferential_attachment', 'label']
train_data = pd.DataFrame(train_positive + train_negative, columns=columns)
test_data = pd.DataFrame(test_positive + test_negative, columns=columns)
# Train Link Prediction Model
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']
X_test = test_data.drop('label', axis=1)
y_test = test_data['label']
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Accuracy: {accuracy}")
print(f"ROC AUC Score: {roc_auc}")
Accuracy: 0.997174527276679 ROC AUC Score: 0.9999768689112704
In [93]:
predicted_links = [(u, v) for (u, v), pred in zip(test_edges, y_pred) if pred == 1]
In [100]:
def recommend_top_games_by_hours(graph, user, predicted_user, game_hours, top_n=1):
"""
Recommend the top-n games by hours played by the predicted user that the original user hasn't played.
Parameters:
graph (networkx.Graph): The original bipartite graph.
user (str): The user to recommend games for.
predicted_user (str): The user connected via a predicted link.
game_hours (pd.Series): A mapping of game IDs to their total hours played.
top_n (int): Number of top games to recommend.
Returns:
List of recommended games.
"""
# Games played by each user
games_user = set(graph.neighbors(user))
games_predicted_user = set(graph.neighbors(predicted_user))
# Find games played by the predicted user but not the original user
unplayed_games = games_predicted_user - games_user
# Rank games by hours played
ranked_games = sorted(
unplayed_games,
key=lambda game: game_hours.get(game, 0), # Default hours played to 0 if missing
reverse=True
)
return ranked_games[:top_n]
# Get the total hours played for each game
game_hours = sample_user_reviews[['app_id', 'hours']].set_index('app_id')['hours']
# Generate recommendations for each predicted link
top_n = 1
recommendations = {}
for u, v in predicted_links:
# Recommend for user u
rec_u = recommend_top_games_by_hours(G, u, v, game_hours, top_n)
if rec_u:
recommendations[u] = rec_u
# Recommend for user v
rec_v = recommend_top_games_by_hours(G, v, u, game_hours, top_n)
if rec_v:
recommendations[v] = rec_v
# Display recommendations with game titles
for user, recs in recommendations.items():
print(f"Recommendations for {user}:")
for i, game in enumerate(recs, 1):
print(f"{games_titles[game]}")
print()
Recommendations for User_11528755: Rebuild 3: Gangs of Deadsville Recommendations for User_3169283: Left 4 Dead 2 Recommendations for User_8216702: The Forest Recommendations for User_11314769: Hurtworld Recommendations for User_2515142: Detroit: Become Human Recommendations for User_3478715: Warframe Recommendations for User_12360999: Hurtworld Recommendations for User_2627399: VRChat Recommendations for User_8100846: Rebuild 3: Gangs of Deadsville Recommendations for User_4839232: Oh...Sir!! The Insult Simulator Recommendations for User_7278557: VRChat Recommendations for User_4428884: Slime Rancher Recommendations for User_4384432: Dead Rising 3 Apocalypse Edition Recommendations for User_1490364: Left 4 Dead 2 Recommendations for User_3159603: DARK SOULS™ II Recommendations for User_6154000: Oh...Sir!! The Insult Simulator Recommendations for User_8751113: Oh...Sir!! The Insult Simulator Recommendations for User_12897038: Knight Online Recommendations for User_6845471: Slime Rancher Recommendations for User_1330371: Detroit: Become Human Recommendations for User_3215691: Tomb Raider: Underworld Recommendations for User_4354181: KINGDOMS Recommendations for User_9015003: VRChat Recommendations for User_3956653: Detroit: Become Human Recommendations for User_7220345: Wizard And Minion Idle Recommendations for User_14276710: The Forest Recommendations for User_1087238: Oh...Sir!! The Insult Simulator Recommendations for User_12988151: Slime Rancher Recommendations for User_8977050: Detroit: Become Human Recommendations for User_4041624: Detroit: Become Human Recommendations for User_2054864: Detroit: Become Human Recommendations for User_2479280: Detroit: Become Human Recommendations for User_5436587: VRChat Recommendations for User_14091926: Oh...Sir!! The Insult Simulator Recommendations for User_10801194: Slime Rancher Recommendations for User_4320885: Dead Rising 3 Apocalypse Edition Recommendations for User_11643239: Oh...Sir!! The Insult Simulator Recommendations for User_4647956: VRChat Recommendations for User_8056987: The Forest Recommendations for User_10291003: The Forest Recommendations for User_821258: VRChat Recommendations for User_6198361: The Forest Recommendations for User_12660619: Slime Rancher Recommendations for User_11794773: Hyperdimension Neptunia Re;Birth1 Recommendations for User_5001327: Slime Rancher Recommendations for User_8266521: VRChat Recommendations for User_4561435: Detroit: Become Human Recommendations for User_7314572: KINGDOMS Recommendations for User_13849442: Detroit: Become Human Recommendations for User_3336928: Marvel’s Spider-Man Remastered Recommendations for User_9008925: VRChat Recommendations for User_6212642: Wizard And Minion Idle Recommendations for User_14253764: Slime Rancher Recommendations for User_2274600: Slime Rancher Recommendations for User_8728442: The Forest Recommendations for User_444465: Detroit: Become Human Recommendations for User_5524561: Hyperdimension Neptunia Re;Birth1 Recommendations for User_3797421: Siege Survival: Gloria Victis Recommendations for User_4855471: Slime Rancher