In [172]:
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from textblob import TextBlob
import re
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
from wordcloud import WordCloud
In [173]:
# Load the data
df = pd.read_csv("nepal_tourist_reviews.csv")
In [174]:
# Basic data exploration
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing Values:\n", df.isnull().sum())
Dataset Shape: (7271, 4) Columns: ['ID', 'location', 'total review', 'review'] Missing Values: ID 0 location 0 total review 38 review 169 dtype: int64
Data Cleaning & Preprocessing¶
In [175]:
# Drop missing values
df.dropna(inplace=True)
In [176]:
# Rename 'total review' to 'total_review'
df.rename(columns={"total review": "total_review"}, inplace=True)
In [177]:
# Print the data frame to check the changes
df.head()
Out[177]:
ID | location | total_review | review | |
---|---|---|---|---|
0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... |
1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... |
2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... |
3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... |
4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... |
In [178]:
# Check the missing values
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())
Dataset Shape: (7102, 4) Missing Values: ID 0 location 0 total_review 0 review 0 dtype: int64
In [179]:
# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
# Lowercase and tokenize
words = re.findall(r"\b\w+\b", text.lower())
# Remove stop words
filtered_words = [word for word in words if word not in stop_words]
return " ".join(filtered_words)
# Apply the preprocessing function to the reviews
df["cleaned_review"] = df["review"].apply(preprocess_text)
[nltk_data] Downloading package stopwords to [nltk_data] /Users/anupamabhatta/nltk_data... [nltk_data] Package stopwords is already up-to-date!
In [180]:
# Print the data frame to check the changes
df.head()
Out[180]:
ID | location | total_review | review | cleaned_review | |
---|---|---|---|---|---|
0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... |
1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... |
2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... |
3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... |
4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... |
Exploratory Data Analysis (EDA)¶
In [181]:
# Clean the 'total_review' column - convert to string, extract numbers and handle NaN values
df["total_review_count"] = pd.to_numeric(
df["total_review"].astype(str).str.extract(r"(\d+)")[0],
errors="coerce", # This will convert errors to NaN
)
# Basic statistics of review counts
print("\nReview Count Statistics:")
print(df["total_review_count"].describe())
Review Count Statistics: count 7102.000000 mean 80.566742 std 119.807382 min 1.000000 25% 13.000000 50% 38.000000 75% 94.000000 max 988.000000 Name: total_review_count, dtype: float64
In [182]:
# Location analysis
print("\nNumber of Unique Locations:", df["location"].nunique())
print("\nTop 10 Locations by Frequency:")
print(df["location"].value_counts())
Number of Unique Locations: 10 Top 10 Locations by Frequency: location Pokhara 936 Lumbini 935 Syambhunath 930 Bhaktapur Durbar Square 930 Chitwan National Park 930 Pasupatinath Temple 930 Langtang 673 Bardiya National Park 595 Annapurna Base Camp 132 Everest Base Camp Trek 111 Name: count, dtype: int64
In [183]:
# Let's also look at the distribution of reviews per location
location_review_stats = (
df.groupby("location")
.agg(
{
"ID": "count", # Number of reviews per location
"total_review_count": [
"mean",
"min",
"max",
], # Statistics of the 'total review' numbers
}
)
.round(2)
)
print("\nLocation Statistics:")
print(location_review_stats)
Location Statistics: ID total_review_count count mean min max location Annapurna Base Camp 132 95.93 1 834 Bardiya National Park 595 73.95 1 874 Bhaktapur Durbar Square 930 81.09 1 917 Chitwan National Park 930 82.14 1 897 Everest Base Camp Trek 111 90.84 1 674 Langtang 673 77.23 1 916 Lumbini 935 90.99 1 917 Pasupatinath Temple 930 70.68 1 936 Pokhara 936 72.19 1 908 Syambhunath 930 89.55 1 988
In [184]:
# Print the data frame to check the changes
df.head()
Out[184]:
ID | location | total_review | review | cleaned_review | total_review_count | |
---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 |
1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 |
2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 |
3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 |
4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 |
EDA Visualization¶
In [185]:
# Distribution of total review counts per location
plt.figure(figsize=(10, 6))
sns.boxplot(
data=df,
x="location",
y="total_review_count",
hue="location",
palette="viridis",
legend=False,
)
plt.title("Distribution of Total Reviews per Location", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Total Review Count", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [186]:
# Bar chart of top locations by review frequency
plt.figure(figsize=(10, 6))
sns.barplot(
x=df["location"].value_counts().index,
y=df["location"].value_counts().values,
hue=df["location"].value_counts().index,
palette="coolwarm",
legend=False,
)
plt.title("Top Locations by Number of Reviews", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Number of Reviews", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [187]:
# Histogram of total review counts
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x="total_review_count", bins=30, kde=True, color="skyblue")
plt.title("Distribution of Total Review Counts", fontsize=14)
plt.xlabel("Total Review Count", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()
In [188]:
# Tokenize reviews and remove stopwords
stop_words = set(stopwords.words("english"))
df["review_tokens"] = (
df["cleaned_review"]
.dropna()
.apply(
lambda x: [
word.lower()
for word in word_tokenize(x)
if word.isalpha() and word.lower() not in stop_words
]
)
)
# Flatten the list of tokens to count word frequencies
all_tokens = [word for tokens in df["review_tokens"].dropna() for word in tokens]
word_counts = Counter(all_tokens)
# Top 30 most common words
common_words = word_counts.most_common(30)
print("\nTop 30 Most Common Words:", common_words)
# Visualize word frequency
words, counts = zip(*common_words)
plt.figure(figsize=(10, 6))
sns.barplot(
x=list(words), y=list(counts), hue=list(words), palette="tab10", legend=False
)
plt.title("Top 30 Most Common Words in Reviews", fontsize=14)
plt.xlabel("Word", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Top 30 Most Common Words: [('place', 4077), ('visit', 1353), ('temple', 1284), ('nepal', 1131), ('one', 1116), ('beautiful', 1084), ('kathmandu', 812), ('best', 771), ('good', 687), ('park', 648), ('nice', 585), ('see', 582), ('great', 565), ('world', 563), ('amazing', 539), ('view', 516), ('national', 510), ('heritage', 487), ('must', 479), ('site', 462), ('buddha', 451), ('also', 446), ('lake', 432), ('time', 408), ('many', 400), ('lord', 396), ('around', 365), ('peaceful', 356), ('people', 351), ('valley', 338)]
In [189]:
# Print the data frame to check the changes
df.head()
Out[189]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | |
---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... |
1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... |
2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... |
3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... |
4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... |
ML Model Training¶
In [190]:
class TourismSentimentClassifier:
def __init__(self):
self.vectorizer = TfidfVectorizer(
max_features=5000, ngram_range=(1, 2), stop_words="english"
)
# Initialize all classifiers
self.classifiers = {
"naive_bayes": MultinomialNB(),
"svm": LinearSVC(random_state=42),
"logistic_regression": LogisticRegression(random_state=42),
"random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
"knn": KNeighborsClassifier(n_neighbors=5),
}
self.trained_models = {}
self.best_model = None
self.best_model_name = None
# Prepare initial labels using lexicon-based approach
def prepare_initial_labels(self, df):
analyzer = SentimentIntensityAnalyzer()
def get_initial_label(text):
scores = analyzer.polarity_scores(text)
if scores["compound"] >= 0.5:
return 2 # very positive
elif scores["compound"] >= 0.1:
return 1 # positive
elif scores["compound"] <= -0.5:
return -2 # very negative
elif scores["compound"] <= -0.1:
return -1 # negative
return 0 # neutral
return df["cleaned_review"].apply(get_initial_label)
# Train and evaluate all classifiers
def train_and_evaluate(self, df):
# Prepare initial labels
y = self.prepare_initial_labels(df)
X = df["cleaned_review"]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Train and evaluate each classifier
results = {}
for name, classifier in self.classifiers.items():
# Create pipeline
pipeline = Pipeline(
[("vectorizer", self.vectorizer), ("classifier", classifier)]
)
# Train
pipeline.fit(X_train, y_train)
# Evaluate
score = pipeline.score(X_test, y_test)
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
# Store results
results[name] = {"score": score, "report": report, "model": pipeline}
self.trained_models[name] = pipeline
# Find best model
best_score = -1
for name, result in results.items():
if result["score"] > best_score:
best_score = result["score"]
self.best_model = result["model"]
self.best_model_name = name
return results
# Make predictions using the best model
def predict(self, text):
if self.best_model is None:
raise ValueError("Models haven't been trained yet!")
return self.best_model.predict([text])[0]
# Make predictions using ensemble of all models
def ensemble_predict(self, text):
if not self.trained_models:
raise ValueError("Models haven't been trained yet!")
predictions = []
for model in self.trained_models.values():
pred = model.predict([text])[0]
predictions.append(pred)
return max(set(predictions), key=predictions.count)
In [191]:
# Comprehensive analysis of reviews using both ML and lexicon-based approaches
def analyze_reviews(df):
# Initialize and train ML classifiers
classifier = TourismSentimentClassifier()
results = classifier.train_and_evaluate(df)
# Add ML predictions to dataframe
df["ml_sentiment"] = df["cleaned_review"].apply(classifier.predict)
df["ensemble_sentiment"] = df["cleaned_review"].apply(classifier.ensemble_predict)
# Add lexicon-based predictions (from previous implementation)
analyzer = SentimentIntensityAnalyzer()
df["lexicon_sentiment"] = df["cleaned_review"].apply(
lambda x: analyzer.polarity_scores(x)["compound"]
)
# Combine predictions
def get_final_sentiment(row):
ml_sent = row["ml_sentiment"]
lex_sent = row["lexicon_sentiment"]
ensemble_sent = row["ensemble_sentiment"]
# Weight and combine different approaches
if abs(ml_sent - ensemble_sent) <= 1: # ML models agree
return ml_sent
else: # Use lexicon as tiebreaker
return (
2
if lex_sent >= 0.5
else 1
if lex_sent >= 0.1
else -2
if lex_sent <= -0.5
else -1
if lex_sent <= -0.1
else 0
)
df["final_sentiment"] = df.apply(get_final_sentiment, axis=1)
return df, results
In [192]:
# Get insights about model performance
def get_model_insights(results):
"""
Get insights about model performance
"""
insights = {"best_model": None, "model_comparison": {}, "feature_importance": {}}
best_score = -1
for name, result in results.items():
score = result["score"]
insights["model_comparison"][name] = {
"accuracy": score,
"detailed_report": result["report"],
}
if score > best_score:
best_score = score
insights["best_model"] = name
return insights
In [193]:
# Analyze reviews using both ML and lexicon-based approaches
df, model_results = analyze_reviews(df)
# Get insights about model performance
insights = get_model_insights(model_results)
print(f"Best performing model: {insights['best_model']}")
for model, metrics in insights["model_comparison"].items():
print(f"\n{model} accuracy: {metrics['accuracy']:.3f}")
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_classes.py:31: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn(
Best performing model: svm naive_bayes accuracy: 0.685 svm accuracy: 0.820 logistic_regression accuracy: 0.787 random_forest accuracy: 0.807 knn accuracy: 0.331
In [194]:
# Print the data frame to check the changes
df.head()
Out[194]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | final_sentiment | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | 2 |
1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | 2 |
2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | 2 |
3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | 2 |
4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | 2 |
ML Model Tuning & Testing¶
In [195]:
class TourismSentimentAnalyzer:
def __init__(self):
self.vectorizer = TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2),
stop_words="english",
min_df=5, # Ignore terms that appear in less than 5 documents
)
# Enhanced SVM parameters
self.classifier = LinearSVC(
dual=False,
random_state=42,
class_weight="balanced",
C=1.0, # Regularization parameter
tol=1e-4, # Tolerance for stopping criterion
)
self.scaler = MinMaxScaler()
def fit(self, X, y):
"""Train the model with given data"""
X_transformed = self.vectorizer.fit_transform(X)
self.classifier.fit(X_transformed, y)
return self
def predict(self, X):
"""Predict sentiment for new data"""
X_transformed = self.vectorizer.transform(X)
return self.classifier.predict(X_transformed)
def improved_preprocess_data(df):
"""
Enhanced preprocessing pipeline with improved SVM-based sentiment analysis
"""
# Handle warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Extract numeric part from 'total_review' and convert to float
df["total_review"] = df["total_review"].str.extract(r"(\d+)").astype(float)
# Calculate trust scores
scaler = MinMaxScaler()
df["trust_score"] = scaler.fit_transform(df[["total_review"]])
# Calculate sentiment polarity using TextBlob
df["sentiment_polarity"] = df["cleaned_review"].apply(
lambda x: TextBlob(str(x)).sentiment.polarity
)
# Initialize and train the sentiment analyzer
analyzer = TourismSentimentAnalyzer()
# Create initial labels for training using TextBlob
initial_sentiments = df["cleaned_review"].apply(
lambda x: 1
if TextBlob(str(x)).sentiment.polarity > 0.2
else -1
if TextBlob(str(x)).sentiment.polarity < -0.2
else 0
)
# Train the SVM model
analyzer.fit(df["cleaned_review"], initial_sentiments)
# Apply ML-based sentiment analysis
df["ml_sentiment_score"] = analyzer.predict(df["cleaned_review"])
# Convert ML scores to final classification (positive/negative/neutral)
df["review_class"] = df["ml_sentiment_score"].map(
{1: "positive", 0: "neutral", -1: "negative"}
)
return df
# Function to apply the sentiment analysis to your main DataFrame
def apply_sentiment_analysis(df):
"""
Apply the improved sentiment analysis to the main DataFrame
"""
# Make sure the DataFrame has the required columns
if "cleaned_review" not in df.columns:
raise ValueError("DataFrame must contain 'cleaned_review' column")
# Apply the improved preprocessing
processed_df = improved_preprocess_data(df)
return processed_df
In [196]:
# Apply the sentiment analysis
df = apply_sentiment_analysis(df)
# Now df contains:
# - trust_score: normalized score based on total_review
# - ml_sentiment_score: raw SVM predictions
# - review_class: final classification as "positive", "negative", or "neutral"
# Display the distribution of sentiment classes
print("\nSentiment Distribution:")
print(df["review_class"].value_counts())
Sentiment Distribution: review_class positive 4776 neutral 2229 negative 97 Name: count, dtype: int64
In [197]:
# Print the data frame to check the changes
df.head()
Out[197]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | final_sentiment | trust_score | sentiment_polarity | ml_sentiment_score | review_class | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | 2 | 0.045593 | 0.700000 | 1 | positive |
1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | 2 | 0.132725 | 0.329167 | 1 | positive |
2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | 2 | 0.300912 | 0.364583 | 1 | positive |
3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | 2 | 0.249240 | 0.687500 | 1 | positive |
4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | 2 | 0.068896 | 0.500000 | 1 | positive |
Pattern Mining¶
In [198]:
def extract_patterns(df):
# Create aspects dictionary
aspects = {
"accessibility": [
"accessibility",
"transport",
"transportation",
"road",
"roads",
"access",
"reach",
"distance",
],
"accommodation": [
"hotel",
"bed",
"room",
"stay",
"house",
"accommodation",
"garden",
],
"culture": [
"temple",
"culture",
"tradition",
"heritage",
"Buddha",
"Buddhism",
"Buddhist",
"Hindu",
"Hinduism",
],
"food": ["food", "restaurant", "breakfast", "dinner", "delicious", "cuisine"],
"nature": [
"mountain",
"mountains",
"view",
"lake",
"scenery",
"landscape",
"nature",
"heaven",
"Himalayas",
"Everest",
"Sagarmatha",
],
"religion": [
"temple",
"monastery",
"stupa",
"Stupa",
"Spirituality",
"pagoda",
"shrine",
"religion",
"religious",
"monk",
"nun",
"priest",
],
"safety": [
"safe",
"security",
"dangerous",
"risk",
"risky",
"safety",
"secure",
"alert",
"crime",
"scam",
"fraud",
],
}
# Extract aspects from reviews (similar to association rule mining)
for aspect, keywords in aspects.items():
df[aspect] = (
df["cleaned_review"]
.fillna("")
.apply(lambda x: any(word in str(x).lower() for word in keywords))
)
return df
# Extract patterns from reviews
df = extract_patterns(df)
df
Out[198]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | sentiment_polarity | ml_sentiment_score | review_class | accessibility | accommodation | culture | food | nature | religion | safety | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | 0.700000 | 1 | positive | False | False | False | False | True | False | False |
1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | 0.329167 | 1 | positive | False | False | False | False | False | True | False |
2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | 0.364583 | 1 | positive | False | False | True | False | True | True | False |
3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | 0.687500 | 1 | positive | False | False | True | False | False | True | False |
4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | 0.500000 | 1 | positive | False | False | False | False | True | True | False |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7266 | 7267 | Pokhara | 9.0 | It's a nice place to sit back, and enjoy. The ... | nice place sit back enjoy calm fresh air lake ... | 9 | [nice, place, sit, back, enjoy, calm, fresh, a... | 2 | 2 | 0.9456 | ... | 0.312500 | 1 | positive | False | False | False | False | True | False | False |
7267 | 7268 | Pokhara | 3.0 | Excellent Place to visit, Lifetime memories | excellent place visit lifetime memories | 3 | [excellent, place, visit, lifetime, memories] | 2 | 2 | 0.5719 | ... | 1.000000 | 1 | positive | False | False | False | False | False | False | False |
7268 | 7269 | Pokhara | 79.0 | It's very photogenic and relaxing when there a... | photogenic relaxing many people | 79 | [photogenic, relaxing, many, people] | 1 | 1 | 0.4939 | ... | 0.500000 | 0 | neutral | False | False | False | False | False | False | False |
7269 | 7270 | Pokhara | 14.0 | U can get real definition of nature's beauty a... | u get real definition nature beauty peace | 14 | [u, get, real, definition, nature, beauty, peace] | 2 | 2 | 0.8074 | ... | 0.200000 | 0 | neutral | False | False | False | False | True | False | False |
7270 | 7271 | Pokhara | 58.0 | Best. Walking please beautiful views | best walking please beautiful views | 58 | [best, walking, please, beautiful, views] | 2 | 2 | 0.8860 | ... | 0.925000 | 1 | positive | False | False | False | False | True | False | False |
7102 rows × 22 columns
Similarity-based Analysis¶
In [199]:
def calculate_similarities(df):
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000, stop_words="english")
text_matrix = tfidf.fit_transform(df["cleaned_review"].fillna(""))
# PCA for dimension reduction
pca = PCA(n_components=2)
text_pca = pca.fit_transform(text_matrix.toarray())
df["pca1"] = text_pca[:, 0]
df["pca2"] = text_pca[:, 1]
return df, tfidf.get_feature_names_out()
# Calculate similarities using TF-IDF and PCA
df, feature_names = calculate_similarities(df)
df
Out[199]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | review_class | accessibility | accommodation | culture | food | nature | religion | safety | pca1 | pca2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | positive | False | False | False | False | True | False | False | 0.086252 | -0.073357 |
1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | positive | False | False | False | False | False | True | False | 0.158235 | -0.086863 |
2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | positive | False | False | True | False | True | True | False | 0.039676 | -0.027877 |
3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | positive | False | False | True | False | False | True | False | -0.088295 | -0.026823 |
4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | positive | False | False | False | False | True | True | False | 0.080956 | -0.090689 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7266 | 7267 | Pokhara | 9.0 | It's a nice place to sit back, and enjoy. The ... | nice place sit back enjoy calm fresh air lake ... | 9 | [nice, place, sit, back, enjoy, calm, fresh, a... | 2 | 2 | 0.9456 | ... | positive | False | False | False | False | True | False | False | 0.035502 | 0.141770 |
7267 | 7268 | Pokhara | 3.0 | Excellent Place to visit, Lifetime memories | excellent place visit lifetime memories | 3 | [excellent, place, visit, lifetime, memories] | 2 | 2 | 0.5719 | ... | positive | False | False | False | False | False | False | False | 0.087738 | -0.041908 |
7268 | 7269 | Pokhara | 79.0 | It's very photogenic and relaxing when there a... | photogenic relaxing many people | 79 | [photogenic, relaxing, many, people] | 1 | 1 | 0.4939 | ... | neutral | False | False | False | False | False | False | False | -0.090147 | 0.020505 |
7269 | 7270 | Pokhara | 14.0 | U can get real definition of nature's beauty a... | u get real definition nature beauty peace | 14 | [u, get, real, definition, nature, beauty, peace] | 2 | 2 | 0.8074 | ... | neutral | False | False | False | False | True | False | False | -0.087003 | 0.007280 |
7270 | 7271 | Pokhara | 58.0 | Best. Walking please beautiful views | best walking please beautiful views | 58 | [best, walking, please, beautiful, views] | 2 | 2 | 0.8860 | ... | positive | False | False | False | False | True | False | False | 0.018753 | -0.067533 |
7102 rows × 24 columns
Network Analysis¶
In [200]:
def create_aspect_network(df):
G = nx.Graph()
aspects = [
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
# Create edges between aspects that commonly co-occur
for i in range(len(aspects)):
for j in range(i + 1, len(aspects)):
weight = df[df[aspects[i]] & df[aspects[j]]].shape[0]
if weight > 0:
G.add_edge(aspects[i], aspects[j], weight=weight)
return G
# Create aspect network
G = create_aspect_network(df)
G.edges(data=True)
# Display EdgeDataView in a table format
print("\nAspect Network:")
aspect = pd.DataFrame(G.edges(data=True), columns=["Aspect1", "Aspect2", "Weight"])
aspect
Aspect Network:
Out[200]:
Aspect1 | Aspect2 | Weight | |
---|---|---|---|
0 | accessibility | accommodation | {'weight': 9} |
1 | accessibility | culture | {'weight': 34} |
2 | accessibility | food | {'weight': 8} |
3 | accessibility | nature | {'weight': 46} |
4 | accessibility | religion | {'weight': 42} |
5 | accessibility | safety | {'weight': 2} |
6 | accommodation | culture | {'weight': 54} |
7 | accommodation | food | {'weight': 48} |
8 | accommodation | nature | {'weight': 66} |
9 | accommodation | religion | {'weight': 44} |
10 | accommodation | safety | {'weight': 3} |
11 | culture | food | {'weight': 63} |
12 | culture | nature | {'weight': 246} |
13 | culture | religion | {'weight': 1129} |
14 | culture | safety | {'weight': 9} |
15 | food | nature | {'weight': 112} |
16 | food | religion | {'weight': 44} |
17 | food | safety | {'weight': 4} |
18 | nature | religion | {'weight': 295} |
19 | nature | safety | {'weight': 14} |
20 | religion | safety | {'weight': 12} |
In [201]:
# Draw the aspect network
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=1000, node_color="skyblue")
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, edge_color="gray")
nx.draw_networkx_labels(G, pos, font_size=12, font_color="black")
plt.title("Aspect Network", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.show()
Segmentation Analysis¶
In [202]:
def analyze_by_trust_segments(df):
df["trust_segment"] = pd.qcut(
df["trust_score"],
q=5,
labels=["Very Low", "Low", "Medium", "High", "Very High"],
)
segment_analysis = df.groupby("trust_segment", observed=False).agg(
{
"ml_sentiment_score": "mean",
"accessibility": "mean",
"accommodation": "mean",
"culture": "mean",
"food": "mean",
"nature": "mean",
"religion": "mean",
"safety": "mean",
}
)
return segment_analysis
# Analyze by trust score segments
segment_analysis = analyze_by_trust_segments(df)
segment_analysis
Out[202]:
ml_sentiment_score | accessibility | accommodation | culture | food | nature | religion | safety | |
---|---|---|---|---|---|---|---|---|
trust_segment | ||||||||
Very Low | 0.675746 | 0.014916 | 0.033074 | 0.151751 | 0.027886 | 0.167315 | 0.136187 | 0.007782 |
Low | 0.671111 | 0.018519 | 0.029630 | 0.215556 | 0.031111 | 0.194074 | 0.196296 | 0.008148 |
Medium | 0.690632 | 0.018155 | 0.031954 | 0.227306 | 0.040668 | 0.202614 | 0.227306 | 0.008715 |
High | 0.662429 | 0.026130 | 0.038136 | 0.247881 | 0.052966 | 0.199859 | 0.243644 | 0.007062 |
Very High | 0.594213 | 0.026817 | 0.044460 | 0.268878 | 0.035992 | 0.204658 | 0.255469 | 0.011291 |
In [203]:
# Print the data frame to check the changes
df.head()
Out[203]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | accessibility | accommodation | culture | food | nature | religion | safety | pca1 | pca2 | trust_segment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | False | False | False | False | True | False | False | 0.086252 | -0.073357 | Medium |
1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | False | False | False | False | False | True | False | 0.158235 | -0.086863 | Very High |
2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | False | False | True | False | True | True | False | 0.039676 | -0.027877 | Very High |
3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | False | False | True | False | False | True | False | -0.088295 | -0.026823 | Very High |
4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | False | False | False | False | True | True | False | 0.080956 | -0.090689 | High |
5 rows × 25 columns
In [204]:
# Generate insights
insights = {
"overall_sentiment": df["ml_sentiment_score"].mean(),
"positive_reviews": (df["review_class"] == "positive").mean(),
"top_locations": df.groupby("location")["ml_sentiment_score"]
.mean()
.sort_values(ascending=False),
"aspect_frequencies": df[
[
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
].mean(),
"trust_segment_analysis": segment_analysis,
}
# Print key insights
print(f"Overall sentiment: {insights['overall_sentiment']:.3f}")
print("\nPositive review percentage: {:.1%}".format(insights["positive_reviews"]))
Overall sentiment: 0.659 Positive review percentage: 67.2%
Aspect Based Sentiment Analysis¶
In [205]:
def investigate_negative_reviews(df):
# Filter for negative and neutral reviews
negative_reviews = df[df["review_class"] == "negative"]
# Analyze aspect mentions in negative/neutral reviews
aspect_issues = negative_reviews[
[
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
].mean()
# Analyze sentiment score distribution
sentiment_distribution = negative_reviews["ml_sentiment_score"].describe()
# Extract common complaint phrases
top_complaints = (
negative_reviews["cleaned_review"]
.str.lower()
.str.split()
.explode()
.value_counts()
.head(30)
)
return {
"aspect_issues": aspect_issues,
"sentiment_distribution": sentiment_distribution,
"top_complaints": top_complaints,
}
# Investigate negative reviews
negative_insights = investigate_negative_reviews(df)
print("\nNegative Review Insights:")
print("Aspect Issues:")
print(negative_insights["aspect_issues"])
print("\nSentiment Distribution:")
print(negative_insights["sentiment_distribution"])
print("\nTop Complaint Phrases:")
print(negative_insights["top_complaints"])
Negative Review Insights: Aspect Issues: accessibility 0.051546 accommodation 0.041237 culture 0.134021 food 0.020619 nature 0.144330 religion 0.134021 safety 0.000000 dtype: float64 Sentiment Distribution: count 97.0 mean -1.0 std 0.0 min -1.0 25% -1.0 50% -1.0 75% -1.0 max -1.0 Name: ml_sentiment_score, dtype: float64 Top Complaint Phrases: cleaned_review place 33 base 29 camp 27 everest 14 visit 10 nepal 10 see 10 annapurna 9 trek 9 temple 9 least 8 elephants 7 life 7 time 7 go 7 one 7 dirty 7 even 7 expensive 6 like 6 park 6 must 6 get 6 jungle 6 take 5 earthquake 5 walk 5 never 5 guide 5 safari 5 Name: count, dtype: int64
In [206]:
def plot_aspect_issues(aspect_issues):
plt.figure(figsize=(10, 6))
sns.barplot(
x=aspect_issues.index,
y=aspect_issues.values,
hue=aspect_issues.index,
palette="viridis",
legend=False,
)
plt.title("Aspect Issues in Negative Reviews", fontsize=16)
plt.ylabel("Average Mention Rate", fontsize=12)
plt.xlabel("Aspects", fontsize=12)
plt.show()
# Plot aspect issues in negative reviews
plot_aspect_issues(negative_insights["aspect_issues"])
In [207]:
def plot_wordcloud(top_complaints):
wordcloud = WordCloud(
width=800, height=400, background_color="white"
).generate_from_frequencies(top_complaints.to_dict())
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Top Complaint Phrases", fontsize=16)
plt.show()
# Plot word cloud of top complaint phrases
plot_wordcloud(negative_insights["top_complaints"])
In [208]:
def generate_positive_insights(df):
# Filter for positive reviews
positive_reviews = df[df["review_class"] == "positive"]
# Extract common positive phrases
positive_phrases = (
positive_reviews["cleaned_review"]
.str.lower()
.str.split()
.explode()
.value_counts()
.head(20)
)
return positive_phrases
# Generate positive review insights
positive_phrases = generate_positive_insights(df)
# Plot word cloud of positive phrases
def plot_wordcloud_positive(positive_phrases):
wordcloud = WordCloud(
width=800, height=400, background_color="white", colormap="viridis"
).generate_from_frequencies(positive_phrases.to_dict())
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Positive Phrases Word Cloud", fontsize=16)
plt.show()
plot_wordcloud_positive(positive_phrases)
In [209]:
def segment_reviews_by_type(df, tourist_types):
segmented_data = {}
for tourist_type, keywords in tourist_types.items():
# Filter reviews containing keywords for this tourist type
mask = df["cleaned_review"].str.contains(
"|".join(keywords), case=False, na=False
)
type_reviews = df[mask]
# Calculate sentiment and aspect mentions for this type
sentiment = type_reviews["ml_sentiment_score"].mean()
aspect_issues = type_reviews[
[
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
].mean()
# Store results
segmented_data[tourist_type] = {
"review_count": type_reviews.shape[0],
"average_sentiment": sentiment,
"aspect_issues": aspect_issues,
}
return segmented_data
# Define tourist types and associated keywords
tourist_types = {
"adventure": ["trek", "hike", "climb", "adventure"],
"cultural": ["temple", "heritage", "festival", "culture"],
"religious": ["pilgrimage", "sacred", "shrine", "god", "prayer"],
}
tourist_segment_analysis = segment_reviews_by_type(df, tourist_types)
print("\nTourist Segment Analysis:")
for type_, data in tourist_segment_analysis.items():
print(f"{type_} - Average Sentiment: {data['average_sentiment']:.3f}")
print(f"Aspect Issues:\n{data['aspect_issues']}")
Tourist Segment Analysis: adventure - Average Sentiment: 0.711 Aspect Issues: accessibility 0.084986 accommodation 0.062323 culture 0.076487 food 0.056657 nature 0.382436 religion 0.093484 safety 0.016997 dtype: float64 cultural - Average Sentiment: 0.595 Aspect Issues: accessibility 0.021879 accommodation 0.035393 culture 0.992921 food 0.041184 nature 0.157658 religion 0.726512 safety 0.005792 dtype: float64 religious - Average Sentiment: 0.439 Aspect Issues: accessibility 0.022305 accommodation 0.074349 culture 0.591078 food 0.007435 nature 0.115242 religion 0.680297 safety 0.000000 dtype: float64
In [210]:
# Prepare data for visualization
tourist_types = list(tourist_segment_analysis.keys())
review_counts = [data["review_count"] for data in tourist_segment_analysis.values()]
average_sentiments = [
data["average_sentiment"] for data in tourist_segment_analysis.values()
]
aspect_issues_df = pd.DataFrame(
[data["aspect_issues"] for data in tourist_segment_analysis.values()],
index=tourist_types,
)
# Bar chart for Average Sentiment Scores
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, average_sentiments, color=["teal", "orange", "purple"])
plt.title("Average Sentiment by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Average Sentiment", fontsize=12)
plt.ylim(0, 1) # Sentiment is likely normalized between 0 and 1
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
In [211]:
# Heatmap for Aspect Issues
plt.figure(figsize=(10, 6))
sns.heatmap(
aspect_issues_df,
annot=True,
cmap="YlGnBu",
fmt=".2f",
cbar_kws={"label": "Mean Value"},
)
plt.title("Aspect Issues Across Tourist Types", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Tourist Type", fontsize=12)
plt.xticks(rotation=45)
plt.show()
In [212]:
# Bar chart for Review Counts
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, review_counts, color=["skyblue", "salmon", "lightgreen"])
plt.title("Review Counts by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Review Count", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
In [213]:
def compare_regions(df):
# Group by location
location_stats = (
df.groupby("location")
.agg(
{
"ml_sentiment_score": ["mean", "std"],
"accessibility": "mean",
"accommodation": "mean",
"culture": "mean",
"food": "mean",
"nature": "mean",
"religion": "mean",
"safety": "mean",
}
)
.sort_values(("ml_sentiment_score", "mean"), ascending=False)
)
# Identify top complaints for each location
complaint_keywords = df.groupby("location")["cleaned_review"].apply(
lambda texts: texts.str.lower().str.split().explode().value_counts().head(5)
)
return location_stats, complaint_keywords
region_stats, location_complaints = compare_regions(df)
print("\nRegional Sentiment Comparison:")
print(region_stats)
print("\nTop Complaints by Region:")
print(location_complaints)
Regional Sentiment Comparison: ml_sentiment_score accessibility \ mean std mean location Pokhara 0.830128 0.405827 0.017094 Syambhunath 0.727957 0.454819 0.046237 Chitwan National Park 0.723656 0.477682 0.018280 Langtang 0.704309 0.475841 0.031204 Bardiya National Park 0.680672 0.480824 0.018487 Bhaktapur Durbar Square 0.664516 0.490303 0.010753 Pasupatinath Temple 0.540860 0.529991 0.007527 Annapurna Base Camp 0.484848 0.682215 0.030303 Lumbini 0.473797 0.518510 0.009626 Everest Base Camp Trek 0.405405 0.692537 0.090090 accommodation culture food nature religion \ mean mean mean mean mean location Pokhara 0.051282 0.071581 0.141026 0.533120 0.073718 Syambhunath 0.012903 0.354839 0.019355 0.366667 0.541935 Chitwan National Park 0.064516 0.079570 0.019355 0.154839 0.016129 Langtang 0.029718 0.026746 0.016345 0.239227 0.002972 Bardiya National Park 0.045378 0.013445 0.008403 0.097479 0.006723 Bhaktapur Durbar Square 0.027957 0.411828 0.078495 0.030108 0.191398 Pasupatinath Temple 0.013978 0.523656 0.002151 0.058065 0.550538 Annapurna Base Camp 0.022727 0.000000 0.022727 0.348485 0.000000 Lumbini 0.041711 0.217112 0.003209 0.019251 0.225668 Everest Base Camp Trek 0.036036 0.000000 0.018018 0.207207 0.000000 safety mean location Pokhara 0.010684 Syambhunath 0.006452 Chitwan National Park 0.013978 Langtang 0.005944 Bardiya National Park 0.011765 Bhaktapur Durbar Square 0.004301 Pasupatinath Temple 0.007527 Annapurna Base Camp 0.007576 Lumbini 0.007487 Everest Base Camp Trek 0.018018 Top Complaints by Region: location Annapurna Base Camp place 37 annapurna 35 amazing 25 trek 22 one 21 Bardiya National Park place 210 park 162 national 135 bardiya 77 good 71 Bhaktapur Durbar Square place 693 bhaktapur 306 square 250 visit 240 durbar 217 Chitwan National Park place 431 park 383 national 288 safari 248 one 231 Everest Base Camp Trek everest 31 base 29 camp 29 experience 22 place 20 Langtang place 216 beautiful 111 trekking 92 park 92 langtang 86 Lumbini place 718 buddha 411 temple 233 birth 232 lord 217 Pasupatinath Temple temple 624 place 477 visit 256 hindu 234 one 205 Pokhara place 641 lake 414 beautiful 257 pokhara 243 visit 177 Syambhunath place 634 kathmandu 478 temple 305 view 274 visit 240 Name: cleaned_review, dtype: int64
In [214]:
# Regional Sentiment Comparison
plt.figure(figsize=(10, 6))
sentiment_data = region_stats[("ml_sentiment_score", "mean")].sort_values()
sentiment_data.plot(kind="barh", color="skyblue", edgecolor="black")
plt.title("Mean Sentiment Scores by Region", fontsize=14)
plt.xlabel("Mean Sentiment Score", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()
In [215]:
# Aspect Metrics Heatmap
plt.figure(figsize=(12, 8))
aspect_data = region_stats.xs("mean", axis=1, level=1).iloc[
:, 1:
] # Exclude sentiment score
sns.heatmap(aspect_data, annot=True, cmap="coolwarm", cbar_kws={"label": "Mean Value"})
plt.title("Aspect Metrics by Region", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()
In [216]:
# Correlation between sentiment and aspects
correlations = region_stats.corr()["ml_sentiment_score"]["mean"]
print("Correlation between sentiment and aspects:\n", correlations)
Correlation between sentiment and aspects: ml_sentiment_score mean 1.000000 std -0.877118 accessibility mean -0.380292 accommodation mean 0.298009 culture mean 0.015219 food mean 0.559874 nature mean 0.423403 religion mean 0.009081 safety mean -0.214823 Name: mean, dtype: float64
In [217]:
df.describe()
Out[217]:
ID | total_review | total_review_count | ml_sentiment | ensemble_sentiment | lexicon_sentiment | final_sentiment | trust_score | sentiment_polarity | ml_sentiment_score | pca1 | pca2 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7.102000e+03 | 7.102000e+03 |
mean | 3631.451563 | 80.566742 | 80.566742 | 1.415517 | 1.469023 | 0.527836 | 1.418051 | 0.080615 | 0.384124 | 0.658828 | 4.001930e-18 | 4.802316e-17 |
std | 2117.239644 | 119.807382 | 119.807382 | 0.881742 | 0.810323 | 0.347161 | 0.878782 | 0.121385 | 0.318223 | 0.502121 | 1.443194e-01 | 1.361941e-01 |
min | 1.000000 | 1.000000 | 1.000000 | -2.000000 | -2.000000 | -0.851900 | -2.000000 | 0.000000 | -1.000000 | -1.000000 | -2.904202e-01 | -4.222869e-01 |
25% | 1809.250000 | 13.000000 | 13.000000 | 1.000000 | 1.000000 | 0.340000 | 1.000000 | 0.012158 | 0.100000 | 0.000000 | -9.846385e-02 | -4.854880e-02 |
50% | 3584.500000 | 38.000000 | 38.000000 | 2.000000 | 2.000000 | 0.624900 | 2.000000 | 0.037487 | 0.386970 | 1.000000 | -3.502423e-02 | -4.703180e-03 |
75% | 5495.750000 | 94.000000 | 94.000000 | 2.000000 | 2.000000 | 0.812600 | 2.000000 | 0.094225 | 0.600000 | 1.000000 | 6.933070e-02 | 2.404595e-02 |
max | 7271.000000 | 988.000000 | 988.000000 | 2.000000 | 2.000000 | 0.989500 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 7.903685e-01 | 8.742405e-01 |
In [218]:
# Prepare data
correlation_values = correlations[1:] # Exclude "mean" which is always 1.0
aspects = correlation_values.index.get_level_values(0)
# Plot
plt.figure(figsize=(10, 6))
correlation_values.plot(kind="barh", color="steelblue", edgecolor="black")
plt.axvline(0, color="black", linewidth=0.8, linestyle="--")
plt.title("Correlation Between Sentiment and Aspects", fontsize=14)
plt.xlabel("Correlation Coefficient", fontsize=12)
plt.ylabel("Aspects", fontsize=12)
plt.tight_layout()
plt.show()
In [219]:
# Print the data frame to check the changes
df.head()
Out[219]:
ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | accessibility | accommodation | culture | food | nature | religion | safety | pca1 | pca2 | trust_segment | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | False | False | False | False | True | False | False | 0.086252 | -0.073357 | Medium |
1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | False | False | False | False | False | True | False | 0.158235 | -0.086863 | Very High |
2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | False | False | True | False | True | True | False | 0.039676 | -0.027877 | Very High |
3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | False | False | True | False | False | True | False | -0.088295 | -0.026823 | Very High |
4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | False | False | False | False | True | True | False | 0.080956 | -0.090689 | High |
5 rows × 25 columns