Game Recommendation System - Exploring the Games&Users Network using Steam Data¶

In [73]:
# import the needed libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
from networkx.algorithms import bipartite

1. Data Exploration¶

In [74]:
# import the datasets
games = pd.read_csv('data/games.csv')
recommendations = pd.read_csv('data/recommendations.csv')
users = pd.read_csv('data/users.csv')
In [75]:
games.sample(5)
Out[75]:
app_id title date_release win mac linux rating positive_ratio user_reviews price_final price_original discount steam_deck
25957 790290 Lonely in the Winter 2018-02-15 True False False Positive 91 36 2.99 2.99 0.0 True
1029 667600 Sky Force Reloaded 2017-11-29 True True True Very Positive 93 1819 9.99 9.99 0.0 True
38784 869760 Stoneshard: Prologue 2018-06-05 True False True Very Positive 87 3508 0.00 0.00 0.0 True
44046 976410 Infinite Children 2019-05-06 True True False Mixed 63 135 0.00 0.00 0.0 True
12285 504390 Along the Edge 2016-10-12 True True True Very Positive 90 294 13.99 13.99 0.0 True
In [76]:
# description of the games dataset
number_of_games = games.shape[0]
median_user_review = games['user_reviews'].median()
median_positive_ratio = games['positive_ratio'].median()

print(f'The dataset contains {number_of_games} games')
print(f'The median number of user review is {median_user_review}')
print(f'The median positive ratio is {median_positive_ratio}')
The dataset contains 50872 games
The median number of user review is 49.0
The median positive ratio is 81.0
In [77]:
# plot the distribution of ratings

plt.figure(figsize=(18, 9))

# Create the count plot
ax = sns.countplot(data=games, x='rating', color='skyblue')

# Annotate each bar with the height (count) value
for p in ax.patches:
    ax.annotate(
        f'{int(p.get_height())}',  # Convert height to an integer and format it as a string
        (p.get_x() + p.get_width() / 2, p.get_height()),  # Position the text at the center top of the bar
        ha='center',  # Center align the text horizontally
        va='bottom'  # Place the text just above the bar
    )

plt.title('Distribution of Rating Types for Games', fontsize=20, fontweight='bold')
plt.xlabel('Rating Type', fontsize=14, fontweight='bold')
plt.ylabel('Count', fontsize=14, fontweight='bold')
sns.despine()

# save the plot
plt.savefig('rating_distribution.png')

plt.show()
No description has been provided for this image
In [78]:
# Plot the distribution of positive_ratio
plt.figure(figsize=(10, 6))
sns.histplot(games['positive_ratio'], bins=20, kde=True, color='steelblue')

plt.title('Distribution of Positive Ratio for Game Ratings', fontsize=16, fontweight='bold')
plt.xlabel('Positive Ratio(%)', fontsize=12, fontweight='bold')
plt.ylabel('Frequency', fontsize=12, fontweight='bold')
sns.despine()
# save the plot
plt.savefig('positive_ratio_distribution.png')

plt.show()
No description has been provided for this image

2. Create the Bipartite graph between the games and users¶

In [79]:
# convert the user_id to a string in the recommendations dataset
recommendations['user_id'] = recommendations['user_id'].apply(lambda x: f'User_{x}')
recommendations.sample(5)
Out[79]:
app_id helpful funny date is_recommended hours user_id review_id
5818036 271590 3 0 2020-07-13 False 75.0 User_9834235 5818036
12626472 292030 0 0 2022-06-30 True 54.3 User_9306483 12626472
21575847 518790 0 0 2020-03-26 True 15.1 User_9992984 21575847
26165166 431960 0 0 2021-01-15 True 16.3 User_1482254 26165166
13034357 292030 0 0 2022-12-27 True 47.2 User_14153991 13034357
In [80]:
# convert the user_id to a string in the users dataset
users['user_id'] = users['user_id'].apply(lambda x: f'User_{x}')
users.sample(5)
Out[80]:
user_id products reviews
6155976 User_10793219 394 1
5850670 User_13145268 63 7
2254244 User_7952159 36 1
11408946 User_8050062 191 4
9805978 User_11181558 18 3
In [81]:
# merge the recommendations and users dataframes by the user_id column
user_reviews = pd.merge(recommendations, users, on='user_id')

# delete the rows that is_recommended is False
user_reviews = user_reviews[user_reviews['is_recommended'] == True]

user_reviews.sample(5)
Out[81]:
app_id helpful funny date is_recommended hours user_id review_id products reviews
33998452 1938090 44 3 2022-10-27 True 507.4 User_7375375 33998452 118 9
39513748 283290 0 0 2020-10-07 True 6.0 User_11181881 39513748 947 124
31436022 542050 0 0 2020-06-21 True 42.3 User_4166972 31436022 2141 23
21557736 1248130 0 0 2022-06-14 True 206.6 User_10266351 21557736 27 2
34194651 747660 0 0 2022-03-19 True 45.1 User_3948669 34194651 9 1
In [82]:
# create a small sample of the user_reviews dataset
sample_user_reviews = user_reviews.sample(10000, random_state=1)
In [83]:
# create a bipartite graph between users and games if is_recommended is True
G = nx.Graph()
G.add_nodes_from(sample_user_reviews['app_id'], bipartite=0)  # Add all app IDs
G.add_nodes_from(sample_user_reviews['user_id'], bipartite=1)  # Add all user IDs

# Add edges between app_id and user_id
G.add_edges_from(zip(sample_user_reviews['app_id'], sample_user_reviews['user_id']))
In [84]:
# store the games' titles as a dictionary
games_titles = games.set_index('app_id')['title'].to_dict()
In [85]:
# check if the graph is bipartite
nx.is_bipartite(G)
Out[85]:
True

3. Graph Exploration¶

In [86]:
# Get the two node sets
game_nodes = {n for n, d in G.nodes(data=True) if d['bipartite'] == 0}
user_nodes = set(G) - game_nodes

# Basic metrics
print("Number of nodes:", G.number_of_nodes())
print("Number of edges:", G.number_of_edges())
print("Average degree:", sum(dict(G.degree()).values()) / G.number_of_nodes())
Number of nodes: 12845
Number of edges: 10000
Average degree: 1.5570260801868432
In [87]:
# Projected graphs
# Project onto game nodes (game-game projection)
game_projection = bipartite.weighted_projected_graph(G, game_nodes)
print("Game-Game Projection - Number of nodes:", game_projection.number_of_nodes())
print("Game-Game Projection - Number of edges:", game_projection.number_of_edges())

# Project onto user nodes (user-user projection)
user_projection = bipartite.weighted_projected_graph(G, user_nodes)
print("User-User Projection - Number of nodes:", user_projection.number_of_nodes())
print("User-User Projection - Number of edges:", user_projection.number_of_edges())

# Centrality analysis (on original graph)
game_centrality = bipartite.degree_centrality(G, game_nodes)
user_centrality = bipartite.degree_centrality(G, user_nodes)
print("Top 5 Games by Degree Centrality:", sorted(game_centrality.items(), key=lambda x: x[1], reverse=True)[:5])
print("Top 5 Users by Degree Centrality:", sorted(user_centrality.items(), key=lambda x: x[1], reverse=True)[:5])

# Clustering coefficient
game_clustering = nx.clustering(game_projection)
user_clustering = nx.clustering(user_projection)
print("Top 5 Games by Clustering Coefficient:", sorted(game_clustering.items(), key=lambda x: x[1], reverse=True)[:5])
print("Top 5 Users by Clustering Coefficient:", sorted(user_clustering.items(), key=lambda x: x[1], reverse=True)[:5])
Game-Game Projection - Number of nodes: 2867
Game-Game Projection - Number of edges: 22
User-User Projection - Number of nodes: 9978
User-User Projection - Number of edges: 69013
Top 5 Games by Degree Centrality: [(440, 0.009921828021647625), (252490, 0.0065143315293646024), (570, 0.005913008618961716), (1091500, 0.005712567648827421), (730, 0.005512126678693125)]
Top 5 Users by Degree Centrality: [(440, 0.009921828021647625), (252490, 0.0065143315293646024), (570, 0.005913008618961716), (1091500, 0.005712567648827421), (730, 0.005512126678693125)]
Top 5 Games by Clustering Coefficient: [(688130, 0), (65540, 0), (262150, 0), (868360, 0), (40970, 0)]
Top 5 Users by Clustering Coefficient: [('User_8649005', 1.0), ('User_11851626', 1.0), ('User_6946150', 1.0), ('User_13815829', 1.0), ('User_1372818', 1.0)]
In [88]:
# print the top 5 games titles and their degree centrality
top_5_games = sorted(game_centrality.items(), key=lambda x: x[1], reverse=True)[:5]
for game, centrality in top_5_games:
    print(f'{games_titles[game]}: {centrality:.4f}')
Team Fortress 2: 0.0099
Rust: 0.0065
Dota 2: 0.0059
Cyberpunk 2077: 0.0057
Counter-Strike: Global Offensive: 0.0055
In [89]:
# print the top 5 games and their clustering coefficient
top_5_games = sorted(game_clustering.items(), key=lambda x: x[1], reverse=True)[:5]
for game, centrality in top_5_games:
    print(f'{games_titles[game]}: {centrality:.4f}')
Pogostuck: Rage With Your Friends: 0.0000
Gothic 1: 0.0000
Vanguard Princess: 0.0000
Project Hospital: 0.0000
Stronghold Crusader HD: 0.0000
In [ ]:
# Community detection
from networkx.algorithms import community

communities = community.louvain_communities(G)
print("Number of detected communities:", len(communities))

# print the features of the largest community
largest_community = max(communities, key=len)
print("Number of nodes in largest community:", len(largest_community))

# print the games in the largest community
print("Games in largest community:")
for node in largest_community:
    if node in game_nodes:
        print(games_titles[node])
Number of detected communities: 2845
Number of nodes in largest community: 100
Games in largest community:
Team Fortress 2

4. Link Prediction¶

In [91]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
import random
# Prepare Data for Link Prediction
# Extract existing edges and generate non-edges from the user projection graph
edges = list(user_projection.edges())
non_edges = list(nx.non_edges(user_projection))

# Split into train and test sets
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)
train_non_edges, test_non_edges = train_test_split(non_edges, test_size=0.2, random_state=42)

# Feature Extraction
def generate_features(graph, edges, label):
    features = []
    for u, v in edges:
        # Jaccard similarity
        jaccard = list(nx.jaccard_coefficient(graph, [(u, v)]))[0][2]

        # Common neighbors
        common_neighbors = len(list(nx.common_neighbors(graph, u, v)))

        # Preferential attachment
        preferential_attachment = list(nx.preferential_attachment(graph, [(u, v)]))[0][2]

        features.append([jaccard, common_neighbors, preferential_attachment, label])
    return features

# Generate training and testing datasets
train_positive = generate_features(user_projection, train_edges, 1)
train_negative = generate_features(user_projection, random.sample(train_non_edges, len(train_edges)), 0)
test_positive = generate_features(user_projection, test_edges, 1)
test_negative = generate_features(user_projection, random.sample(test_non_edges, len(test_edges)), 0)

# Combine datasets
columns = ['jaccard', 'common_neighbors', 'preferential_attachment', 'label']
train_data = pd.DataFrame(train_positive + train_negative, columns=columns)
test_data = pd.DataFrame(test_positive + test_negative, columns=columns)

# Train Link Prediction Model
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']
X_test = test_data.drop('label', axis=1)
y_test = test_data['label']

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Accuracy: {accuracy}")
print(f"ROC AUC Score: {roc_auc}")
Accuracy: 0.997174527276679
ROC AUC Score: 0.9999768689112704
In [93]:
predicted_links = [(u, v) for (u, v), pred in zip(test_edges, y_pred) if pred == 1]
In [100]:
def recommend_top_games_by_hours(graph, user, predicted_user, game_hours, top_n=1):
    """
    Recommend the top-n games by hours played by the predicted user that the original user hasn't played.

    Parameters:
        graph (networkx.Graph): The original bipartite graph.
        user (str): The user to recommend games for.
        predicted_user (str): The user connected via a predicted link.
        game_hours (pd.Series): A mapping of game IDs to their total hours played.
        top_n (int): Number of top games to recommend.

    Returns:
        List of recommended games.
    """
    # Games played by each user
    games_user = set(graph.neighbors(user))
    games_predicted_user = set(graph.neighbors(predicted_user))
    
    # Find games played by the predicted user but not the original user
    unplayed_games = games_predicted_user - games_user
    
    # Rank games by hours played
    ranked_games = sorted(
        unplayed_games,
        key=lambda game: game_hours.get(game, 0),  # Default hours played to 0 if missing
        reverse=True
    )
    
    return ranked_games[:top_n]

# Get the total hours played for each game
game_hours = sample_user_reviews[['app_id', 'hours']].set_index('app_id')['hours']

# Generate recommendations for each predicted link
top_n = 1
recommendations = {}

for u, v in predicted_links:
    # Recommend for user u
    rec_u = recommend_top_games_by_hours(G, u, v, game_hours, top_n)
    if rec_u:
        recommendations[u] = rec_u
    
    # Recommend for user v
    rec_v = recommend_top_games_by_hours(G, v, u, game_hours, top_n)
    if rec_v:
        recommendations[v] = rec_v

# Display recommendations with game titles
for user, recs in recommendations.items():
    print(f"Recommendations for {user}:")
    for i, game in enumerate(recs, 1):
        print(f"{games_titles[game]}")
    print()
Recommendations for User_11528755:
Rebuild 3: Gangs of Deadsville

Recommendations for User_3169283:
Left 4 Dead 2

Recommendations for User_8216702:
The Forest

Recommendations for User_11314769:
Hurtworld

Recommendations for User_2515142:
Detroit: Become Human

Recommendations for User_3478715:
Warframe

Recommendations for User_12360999:
Hurtworld

Recommendations for User_2627399:
VRChat

Recommendations for User_8100846:
Rebuild 3: Gangs of Deadsville

Recommendations for User_4839232:
Oh...Sir!! The Insult Simulator

Recommendations for User_7278557:
VRChat

Recommendations for User_4428884:
Slime Rancher

Recommendations for User_4384432:
Dead Rising 3 Apocalypse Edition

Recommendations for User_1490364:
Left 4 Dead 2

Recommendations for User_3159603:
DARK SOULS™ II

Recommendations for User_6154000:
Oh...Sir!! The Insult Simulator

Recommendations for User_8751113:
Oh...Sir!! The Insult Simulator

Recommendations for User_12897038:
Knight Online

Recommendations for User_6845471:
Slime Rancher

Recommendations for User_1330371:
Detroit: Become Human

Recommendations for User_3215691:
Tomb Raider: Underworld

Recommendations for User_4354181:
KINGDOMS

Recommendations for User_9015003:
VRChat

Recommendations for User_3956653:
Detroit: Become Human

Recommendations for User_7220345:
Wizard And Minion Idle

Recommendations for User_14276710:
The Forest

Recommendations for User_1087238:
Oh...Sir!! The Insult Simulator

Recommendations for User_12988151:
Slime Rancher

Recommendations for User_8977050:
Detroit: Become Human

Recommendations for User_4041624:
Detroit: Become Human

Recommendations for User_2054864:
Detroit: Become Human

Recommendations for User_2479280:
Detroit: Become Human

Recommendations for User_5436587:
VRChat

Recommendations for User_14091926:
Oh...Sir!! The Insult Simulator

Recommendations for User_10801194:
Slime Rancher

Recommendations for User_4320885:
Dead Rising 3 Apocalypse Edition

Recommendations for User_11643239:
Oh...Sir!! The Insult Simulator

Recommendations for User_4647956:
VRChat

Recommendations for User_8056987:
The Forest

Recommendations for User_10291003:
The Forest

Recommendations for User_821258:
VRChat

Recommendations for User_6198361:
The Forest

Recommendations for User_12660619:
Slime Rancher

Recommendations for User_11794773:
Hyperdimension Neptunia Re;Birth1

Recommendations for User_5001327:
Slime Rancher

Recommendations for User_8266521:
VRChat

Recommendations for User_4561435:
Detroit: Become Human

Recommendations for User_7314572:
KINGDOMS

Recommendations for User_13849442:
Detroit: Become Human

Recommendations for User_3336928:
Marvel’s Spider-Man Remastered

Recommendations for User_9008925:
VRChat

Recommendations for User_6212642:
Wizard And Minion Idle

Recommendations for User_14253764:
Slime Rancher

Recommendations for User_2274600:
Slime Rancher

Recommendations for User_8728442:
The Forest

Recommendations for User_444465:
Detroit: Become Human

Recommendations for User_5524561:
Hyperdimension Neptunia Re;Birth1

Recommendations for User_3797421:
Siege Survival: Gloria Victis

Recommendations for User_4855471:
Slime Rancher