In [172]:
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from textblob import TextBlob
import re
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
from wordcloud import WordCloud
In [173]:
# Load the data
df = pd.read_csv("nepal_tourist_reviews.csv")
In [174]:
# Basic data exploration
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing Values:\n", df.isnull().sum())
Dataset Shape: (7271, 4) Columns: ['ID', 'location', 'total review', 'review'] Missing Values: ID 0 location 0 total review 38 review 169 dtype: int64
Data Cleaning & Preprocessing¶
In [175]:
# Drop missing values
df.dropna(inplace=True)
In [176]:
# Rename 'total review' to 'total_review'
df.rename(columns={"total review": "total_review"}, inplace=True)
In [177]:
# Print the data frame to check the changes
df.head()
Out[177]:
| ID | location | total_review | review | |
|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... |
| 1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... |
| 2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... |
| 3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... |
| 4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... |
In [178]:
# Check the missing values
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())
Dataset Shape: (7102, 4) Missing Values: ID 0 location 0 total_review 0 review 0 dtype: int64
In [179]:
# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
def preprocess_text(text):
# Lowercase and tokenize
words = re.findall(r"\b\w+\b", text.lower())
# Remove stop words
filtered_words = [word for word in words if word not in stop_words]
return " ".join(filtered_words)
# Apply the preprocessing function to the reviews
df["cleaned_review"] = df["review"].apply(preprocess_text)
[nltk_data] Downloading package stopwords to [nltk_data] /Users/anupamabhatta/nltk_data... [nltk_data] Package stopwords is already up-to-date!
In [180]:
# Print the data frame to check the changes
df.head()
Out[180]:
| ID | location | total_review | review | cleaned_review | |
|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... |
| 1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... |
| 2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... |
| 3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... |
| 4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... |
Exploratory Data Analysis (EDA)¶
In [181]:
# Clean the 'total_review' column - convert to string, extract numbers and handle NaN values
df["total_review_count"] = pd.to_numeric(
df["total_review"].astype(str).str.extract(r"(\d+)")[0],
errors="coerce", # This will convert errors to NaN
)
# Basic statistics of review counts
print("\nReview Count Statistics:")
print(df["total_review_count"].describe())
Review Count Statistics: count 7102.000000 mean 80.566742 std 119.807382 min 1.000000 25% 13.000000 50% 38.000000 75% 94.000000 max 988.000000 Name: total_review_count, dtype: float64
In [182]:
# Location analysis
print("\nNumber of Unique Locations:", df["location"].nunique())
print("\nTop 10 Locations by Frequency:")
print(df["location"].value_counts())
Number of Unique Locations: 10 Top 10 Locations by Frequency: location Pokhara 936 Lumbini 935 Syambhunath 930 Bhaktapur Durbar Square 930 Chitwan National Park 930 Pasupatinath Temple 930 Langtang 673 Bardiya National Park 595 Annapurna Base Camp 132 Everest Base Camp Trek 111 Name: count, dtype: int64
In [183]:
# Let's also look at the distribution of reviews per location
location_review_stats = (
df.groupby("location")
.agg(
{
"ID": "count", # Number of reviews per location
"total_review_count": [
"mean",
"min",
"max",
], # Statistics of the 'total review' numbers
}
)
.round(2)
)
print("\nLocation Statistics:")
print(location_review_stats)
Location Statistics:
ID total_review_count
count mean min max
location
Annapurna Base Camp 132 95.93 1 834
Bardiya National Park 595 73.95 1 874
Bhaktapur Durbar Square 930 81.09 1 917
Chitwan National Park 930 82.14 1 897
Everest Base Camp Trek 111 90.84 1 674
Langtang 673 77.23 1 916
Lumbini 935 90.99 1 917
Pasupatinath Temple 930 70.68 1 936
Pokhara 936 72.19 1 908
Syambhunath 930 89.55 1 988
In [184]:
# Print the data frame to check the changes
df.head()
Out[184]:
| ID | location | total_review | review | cleaned_review | total_review_count | |
|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 |
| 1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 |
| 2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 |
| 3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 |
| 4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 |
EDA Visualization¶
In [185]:
# Distribution of total review counts per location
plt.figure(figsize=(10, 6))
sns.boxplot(
data=df,
x="location",
y="total_review_count",
hue="location",
palette="viridis",
legend=False,
)
plt.title("Distribution of Total Reviews per Location", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Total Review Count", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [186]:
# Bar chart of top locations by review frequency
plt.figure(figsize=(10, 6))
sns.barplot(
x=df["location"].value_counts().index,
y=df["location"].value_counts().values,
hue=df["location"].value_counts().index,
palette="coolwarm",
legend=False,
)
plt.title("Top Locations by Number of Reviews", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Number of Reviews", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
In [187]:
# Histogram of total review counts
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x="total_review_count", bins=30, kde=True, color="skyblue")
plt.title("Distribution of Total Review Counts", fontsize=14)
plt.xlabel("Total Review Count", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()
In [188]:
# Tokenize reviews and remove stopwords
stop_words = set(stopwords.words("english"))
df["review_tokens"] = (
df["cleaned_review"]
.dropna()
.apply(
lambda x: [
word.lower()
for word in word_tokenize(x)
if word.isalpha() and word.lower() not in stop_words
]
)
)
# Flatten the list of tokens to count word frequencies
all_tokens = [word for tokens in df["review_tokens"].dropna() for word in tokens]
word_counts = Counter(all_tokens)
# Top 30 most common words
common_words = word_counts.most_common(30)
print("\nTop 30 Most Common Words:", common_words)
# Visualize word frequency
words, counts = zip(*common_words)
plt.figure(figsize=(10, 6))
sns.barplot(
x=list(words), y=list(counts), hue=list(words), palette="tab10", legend=False
)
plt.title("Top 30 Most Common Words in Reviews", fontsize=14)
plt.xlabel("Word", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Top 30 Most Common Words: [('place', 4077), ('visit', 1353), ('temple', 1284), ('nepal', 1131), ('one', 1116), ('beautiful', 1084), ('kathmandu', 812), ('best', 771), ('good', 687), ('park', 648), ('nice', 585), ('see', 582), ('great', 565), ('world', 563), ('amazing', 539), ('view', 516), ('national', 510), ('heritage', 487), ('must', 479), ('site', 462), ('buddha', 451), ('also', 446), ('lake', 432), ('time', 408), ('many', 400), ('lord', 396), ('around', 365), ('peaceful', 356), ('people', 351), ('valley', 338)]
In [189]:
# Print the data frame to check the changes
df.head()
Out[189]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... |
| 1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... |
| 2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... |
| 3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... |
| 4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... |
ML Model Training¶
In [190]:
class TourismSentimentClassifier:
def __init__(self):
self.vectorizer = TfidfVectorizer(
max_features=5000, ngram_range=(1, 2), stop_words="english"
)
# Initialize all classifiers
self.classifiers = {
"naive_bayes": MultinomialNB(),
"svm": LinearSVC(random_state=42),
"logistic_regression": LogisticRegression(random_state=42),
"random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
"knn": KNeighborsClassifier(n_neighbors=5),
}
self.trained_models = {}
self.best_model = None
self.best_model_name = None
# Prepare initial labels using lexicon-based approach
def prepare_initial_labels(self, df):
analyzer = SentimentIntensityAnalyzer()
def get_initial_label(text):
scores = analyzer.polarity_scores(text)
if scores["compound"] >= 0.5:
return 2 # very positive
elif scores["compound"] >= 0.1:
return 1 # positive
elif scores["compound"] <= -0.5:
return -2 # very negative
elif scores["compound"] <= -0.1:
return -1 # negative
return 0 # neutral
return df["cleaned_review"].apply(get_initial_label)
# Train and evaluate all classifiers
def train_and_evaluate(self, df):
# Prepare initial labels
y = self.prepare_initial_labels(df)
X = df["cleaned_review"]
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
# Train and evaluate each classifier
results = {}
for name, classifier in self.classifiers.items():
# Create pipeline
pipeline = Pipeline(
[("vectorizer", self.vectorizer), ("classifier", classifier)]
)
# Train
pipeline.fit(X_train, y_train)
# Evaluate
score = pipeline.score(X_test, y_test)
y_pred = pipeline.predict(X_test)
report = classification_report(y_test, y_pred)
# Store results
results[name] = {"score": score, "report": report, "model": pipeline}
self.trained_models[name] = pipeline
# Find best model
best_score = -1
for name, result in results.items():
if result["score"] > best_score:
best_score = result["score"]
self.best_model = result["model"]
self.best_model_name = name
return results
# Make predictions using the best model
def predict(self, text):
if self.best_model is None:
raise ValueError("Models haven't been trained yet!")
return self.best_model.predict([text])[0]
# Make predictions using ensemble of all models
def ensemble_predict(self, text):
if not self.trained_models:
raise ValueError("Models haven't been trained yet!")
predictions = []
for model in self.trained_models.values():
pred = model.predict([text])[0]
predictions.append(pred)
return max(set(predictions), key=predictions.count)
In [191]:
# Comprehensive analysis of reviews using both ML and lexicon-based approaches
def analyze_reviews(df):
# Initialize and train ML classifiers
classifier = TourismSentimentClassifier()
results = classifier.train_and_evaluate(df)
# Add ML predictions to dataframe
df["ml_sentiment"] = df["cleaned_review"].apply(classifier.predict)
df["ensemble_sentiment"] = df["cleaned_review"].apply(classifier.ensemble_predict)
# Add lexicon-based predictions (from previous implementation)
analyzer = SentimentIntensityAnalyzer()
df["lexicon_sentiment"] = df["cleaned_review"].apply(
lambda x: analyzer.polarity_scores(x)["compound"]
)
# Combine predictions
def get_final_sentiment(row):
ml_sent = row["ml_sentiment"]
lex_sent = row["lexicon_sentiment"]
ensemble_sent = row["ensemble_sentiment"]
# Weight and combine different approaches
if abs(ml_sent - ensemble_sent) <= 1: # ML models agree
return ml_sent
else: # Use lexicon as tiebreaker
return (
2
if lex_sent >= 0.5
else 1
if lex_sent >= 0.1
else -2
if lex_sent <= -0.5
else -1
if lex_sent <= -0.1
else 0
)
df["final_sentiment"] = df.apply(get_final_sentiment, axis=1)
return df, results
In [192]:
# Get insights about model performance
def get_model_insights(results):
"""
Get insights about model performance
"""
insights = {"best_model": None, "model_comparison": {}, "feature_importance": {}}
best_score = -1
for name, result in results.items():
score = result["score"]
insights["model_comparison"][name] = {
"accuracy": score,
"detailed_report": result["report"],
}
if score > best_score:
best_score = score
insights["best_model"] = name
return insights
In [193]:
# Analyze reviews using both ML and lexicon-based approaches
df, model_results = analyze_reviews(df)
# Get insights about model performance
insights = get_model_insights(model_results)
print(f"Best performing model: {insights['best_model']}")
for model, metrics in insights["model_comparison"].items():
print(f"\n{model} accuracy: {metrics['accuracy']:.3f}")
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_classes.py:31: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning. warnings.warn(
Best performing model: svm naive_bayes accuracy: 0.685 svm accuracy: 0.820 logistic_regression accuracy: 0.787 random_forest accuracy: 0.807 knn accuracy: 0.331
In [194]:
# Print the data frame to check the changes
df.head()
Out[194]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | final_sentiment | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46 reviews | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | 2 |
| 1 | 2 | Syambhunath | 132 reviews | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | 2 |
| 2 | 3 | Syambhunath | 298 reviews | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | 2 |
| 3 | 4 | Syambhunath | 247 reviews | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | 2 |
| 4 | 5 | Syambhunath | 69 reviews | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | 2 |
ML Model Tuning & Testing¶
In [195]:
class TourismSentimentAnalyzer:
def __init__(self):
self.vectorizer = TfidfVectorizer(
max_features=5000,
ngram_range=(1, 2),
stop_words="english",
min_df=5, # Ignore terms that appear in less than 5 documents
)
# Enhanced SVM parameters
self.classifier = LinearSVC(
dual=False,
random_state=42,
class_weight="balanced",
C=1.0, # Regularization parameter
tol=1e-4, # Tolerance for stopping criterion
)
self.scaler = MinMaxScaler()
def fit(self, X, y):
"""Train the model with given data"""
X_transformed = self.vectorizer.fit_transform(X)
self.classifier.fit(X_transformed, y)
return self
def predict(self, X):
"""Predict sentiment for new data"""
X_transformed = self.vectorizer.transform(X)
return self.classifier.predict(X_transformed)
def improved_preprocess_data(df):
"""
Enhanced preprocessing pipeline with improved SVM-based sentiment analysis
"""
# Handle warnings
warnings.filterwarnings("ignore", category=UserWarning)
# Extract numeric part from 'total_review' and convert to float
df["total_review"] = df["total_review"].str.extract(r"(\d+)").astype(float)
# Calculate trust scores
scaler = MinMaxScaler()
df["trust_score"] = scaler.fit_transform(df[["total_review"]])
# Calculate sentiment polarity using TextBlob
df["sentiment_polarity"] = df["cleaned_review"].apply(
lambda x: TextBlob(str(x)).sentiment.polarity
)
# Initialize and train the sentiment analyzer
analyzer = TourismSentimentAnalyzer()
# Create initial labels for training using TextBlob
initial_sentiments = df["cleaned_review"].apply(
lambda x: 1
if TextBlob(str(x)).sentiment.polarity > 0.2
else -1
if TextBlob(str(x)).sentiment.polarity < -0.2
else 0
)
# Train the SVM model
analyzer.fit(df["cleaned_review"], initial_sentiments)
# Apply ML-based sentiment analysis
df["ml_sentiment_score"] = analyzer.predict(df["cleaned_review"])
# Convert ML scores to final classification (positive/negative/neutral)
df["review_class"] = df["ml_sentiment_score"].map(
{1: "positive", 0: "neutral", -1: "negative"}
)
return df
# Function to apply the sentiment analysis to your main DataFrame
def apply_sentiment_analysis(df):
"""
Apply the improved sentiment analysis to the main DataFrame
"""
# Make sure the DataFrame has the required columns
if "cleaned_review" not in df.columns:
raise ValueError("DataFrame must contain 'cleaned_review' column")
# Apply the improved preprocessing
processed_df = improved_preprocess_data(df)
return processed_df
In [196]:
# Apply the sentiment analysis
df = apply_sentiment_analysis(df)
# Now df contains:
# - trust_score: normalized score based on total_review
# - ml_sentiment_score: raw SVM predictions
# - review_class: final classification as "positive", "negative", or "neutral"
# Display the distribution of sentiment classes
print("\nSentiment Distribution:")
print(df["review_class"].value_counts())
Sentiment Distribution: review_class positive 4776 neutral 2229 negative 97 Name: count, dtype: int64
In [197]:
# Print the data frame to check the changes
df.head()
Out[197]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | final_sentiment | trust_score | sentiment_polarity | ml_sentiment_score | review_class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | 2 | 0.045593 | 0.700000 | 1 | positive |
| 1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | 2 | 0.132725 | 0.329167 | 1 | positive |
| 2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | 2 | 0.300912 | 0.364583 | 1 | positive |
| 3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | 2 | 0.249240 | 0.687500 | 1 | positive |
| 4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | 2 | 0.068896 | 0.500000 | 1 | positive |
Pattern Mining¶
In [198]:
def extract_patterns(df):
# Create aspects dictionary
aspects = {
"accessibility": [
"accessibility",
"transport",
"transportation",
"road",
"roads",
"access",
"reach",
"distance",
],
"accommodation": [
"hotel",
"bed",
"room",
"stay",
"house",
"accommodation",
"garden",
],
"culture": [
"temple",
"culture",
"tradition",
"heritage",
"Buddha",
"Buddhism",
"Buddhist",
"Hindu",
"Hinduism",
],
"food": ["food", "restaurant", "breakfast", "dinner", "delicious", "cuisine"],
"nature": [
"mountain",
"mountains",
"view",
"lake",
"scenery",
"landscape",
"nature",
"heaven",
"Himalayas",
"Everest",
"Sagarmatha",
],
"religion": [
"temple",
"monastery",
"stupa",
"Stupa",
"Spirituality",
"pagoda",
"shrine",
"religion",
"religious",
"monk",
"nun",
"priest",
],
"safety": [
"safe",
"security",
"dangerous",
"risk",
"risky",
"safety",
"secure",
"alert",
"crime",
"scam",
"fraud",
],
}
# Extract aspects from reviews (similar to association rule mining)
for aspect, keywords in aspects.items():
df[aspect] = (
df["cleaned_review"]
.fillna("")
.apply(lambda x: any(word in str(x).lower() for word in keywords))
)
return df
# Extract patterns from reviews
df = extract_patterns(df)
df
Out[198]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | sentiment_polarity | ml_sentiment_score | review_class | accessibility | accommodation | culture | food | nature | religion | safety | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | 0.700000 | 1 | positive | False | False | False | False | True | False | False |
| 1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | 0.329167 | 1 | positive | False | False | False | False | False | True | False |
| 2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | 0.364583 | 1 | positive | False | False | True | False | True | True | False |
| 3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | 0.687500 | 1 | positive | False | False | True | False | False | True | False |
| 4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | 0.500000 | 1 | positive | False | False | False | False | True | True | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7266 | 7267 | Pokhara | 9.0 | It's a nice place to sit back, and enjoy. The ... | nice place sit back enjoy calm fresh air lake ... | 9 | [nice, place, sit, back, enjoy, calm, fresh, a... | 2 | 2 | 0.9456 | ... | 0.312500 | 1 | positive | False | False | False | False | True | False | False |
| 7267 | 7268 | Pokhara | 3.0 | Excellent Place to visit, Lifetime memories | excellent place visit lifetime memories | 3 | [excellent, place, visit, lifetime, memories] | 2 | 2 | 0.5719 | ... | 1.000000 | 1 | positive | False | False | False | False | False | False | False |
| 7268 | 7269 | Pokhara | 79.0 | It's very photogenic and relaxing when there a... | photogenic relaxing many people | 79 | [photogenic, relaxing, many, people] | 1 | 1 | 0.4939 | ... | 0.500000 | 0 | neutral | False | False | False | False | False | False | False |
| 7269 | 7270 | Pokhara | 14.0 | U can get real definition of nature's beauty a... | u get real definition nature beauty peace | 14 | [u, get, real, definition, nature, beauty, peace] | 2 | 2 | 0.8074 | ... | 0.200000 | 0 | neutral | False | False | False | False | True | False | False |
| 7270 | 7271 | Pokhara | 58.0 | Best. Walking please beautiful views | best walking please beautiful views | 58 | [best, walking, please, beautiful, views] | 2 | 2 | 0.8860 | ... | 0.925000 | 1 | positive | False | False | False | False | True | False | False |
7102 rows × 22 columns
Similarity-based Analysis¶
In [199]:
def calculate_similarities(df):
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=1000, stop_words="english")
text_matrix = tfidf.fit_transform(df["cleaned_review"].fillna(""))
# PCA for dimension reduction
pca = PCA(n_components=2)
text_pca = pca.fit_transform(text_matrix.toarray())
df["pca1"] = text_pca[:, 0]
df["pca2"] = text_pca[:, 1]
return df, tfidf.get_feature_names_out()
# Calculate similarities using TF-IDF and PCA
df, feature_names = calculate_similarities(df)
df
Out[199]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | review_class | accessibility | accommodation | culture | food | nature | religion | safety | pca1 | pca2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | positive | False | False | False | False | True | False | False | 0.086252 | -0.073357 |
| 1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | positive | False | False | False | False | False | True | False | 0.158235 | -0.086863 |
| 2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | positive | False | False | True | False | True | True | False | 0.039676 | -0.027877 |
| 3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | positive | False | False | True | False | False | True | False | -0.088295 | -0.026823 |
| 4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | positive | False | False | False | False | True | True | False | 0.080956 | -0.090689 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7266 | 7267 | Pokhara | 9.0 | It's a nice place to sit back, and enjoy. The ... | nice place sit back enjoy calm fresh air lake ... | 9 | [nice, place, sit, back, enjoy, calm, fresh, a... | 2 | 2 | 0.9456 | ... | positive | False | False | False | False | True | False | False | 0.035502 | 0.141770 |
| 7267 | 7268 | Pokhara | 3.0 | Excellent Place to visit, Lifetime memories | excellent place visit lifetime memories | 3 | [excellent, place, visit, lifetime, memories] | 2 | 2 | 0.5719 | ... | positive | False | False | False | False | False | False | False | 0.087738 | -0.041908 |
| 7268 | 7269 | Pokhara | 79.0 | It's very photogenic and relaxing when there a... | photogenic relaxing many people | 79 | [photogenic, relaxing, many, people] | 1 | 1 | 0.4939 | ... | neutral | False | False | False | False | False | False | False | -0.090147 | 0.020505 |
| 7269 | 7270 | Pokhara | 14.0 | U can get real definition of nature's beauty a... | u get real definition nature beauty peace | 14 | [u, get, real, definition, nature, beauty, peace] | 2 | 2 | 0.8074 | ... | neutral | False | False | False | False | True | False | False | -0.087003 | 0.007280 |
| 7270 | 7271 | Pokhara | 58.0 | Best. Walking please beautiful views | best walking please beautiful views | 58 | [best, walking, please, beautiful, views] | 2 | 2 | 0.8860 | ... | positive | False | False | False | False | True | False | False | 0.018753 | -0.067533 |
7102 rows × 24 columns
Network Analysis¶
In [200]:
def create_aspect_network(df):
G = nx.Graph()
aspects = [
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
# Create edges between aspects that commonly co-occur
for i in range(len(aspects)):
for j in range(i + 1, len(aspects)):
weight = df[df[aspects[i]] & df[aspects[j]]].shape[0]
if weight > 0:
G.add_edge(aspects[i], aspects[j], weight=weight)
return G
# Create aspect network
G = create_aspect_network(df)
G.edges(data=True)
# Display EdgeDataView in a table format
print("\nAspect Network:")
aspect = pd.DataFrame(G.edges(data=True), columns=["Aspect1", "Aspect2", "Weight"])
aspect
Aspect Network:
Out[200]:
| Aspect1 | Aspect2 | Weight | |
|---|---|---|---|
| 0 | accessibility | accommodation | {'weight': 9} |
| 1 | accessibility | culture | {'weight': 34} |
| 2 | accessibility | food | {'weight': 8} |
| 3 | accessibility | nature | {'weight': 46} |
| 4 | accessibility | religion | {'weight': 42} |
| 5 | accessibility | safety | {'weight': 2} |
| 6 | accommodation | culture | {'weight': 54} |
| 7 | accommodation | food | {'weight': 48} |
| 8 | accommodation | nature | {'weight': 66} |
| 9 | accommodation | religion | {'weight': 44} |
| 10 | accommodation | safety | {'weight': 3} |
| 11 | culture | food | {'weight': 63} |
| 12 | culture | nature | {'weight': 246} |
| 13 | culture | religion | {'weight': 1129} |
| 14 | culture | safety | {'weight': 9} |
| 15 | food | nature | {'weight': 112} |
| 16 | food | religion | {'weight': 44} |
| 17 | food | safety | {'weight': 4} |
| 18 | nature | religion | {'weight': 295} |
| 19 | nature | safety | {'weight': 14} |
| 20 | religion | safety | {'weight': 12} |
In [201]:
# Draw the aspect network
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=1000, node_color="skyblue")
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, edge_color="gray")
nx.draw_networkx_labels(G, pos, font_size=12, font_color="black")
plt.title("Aspect Network", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.show()
Segmentation Analysis¶
In [202]:
def analyze_by_trust_segments(df):
df["trust_segment"] = pd.qcut(
df["trust_score"],
q=5,
labels=["Very Low", "Low", "Medium", "High", "Very High"],
)
segment_analysis = df.groupby("trust_segment", observed=False).agg(
{
"ml_sentiment_score": "mean",
"accessibility": "mean",
"accommodation": "mean",
"culture": "mean",
"food": "mean",
"nature": "mean",
"religion": "mean",
"safety": "mean",
}
)
return segment_analysis
# Analyze by trust score segments
segment_analysis = analyze_by_trust_segments(df)
segment_analysis
Out[202]:
| ml_sentiment_score | accessibility | accommodation | culture | food | nature | religion | safety | |
|---|---|---|---|---|---|---|---|---|
| trust_segment | ||||||||
| Very Low | 0.675746 | 0.014916 | 0.033074 | 0.151751 | 0.027886 | 0.167315 | 0.136187 | 0.007782 |
| Low | 0.671111 | 0.018519 | 0.029630 | 0.215556 | 0.031111 | 0.194074 | 0.196296 | 0.008148 |
| Medium | 0.690632 | 0.018155 | 0.031954 | 0.227306 | 0.040668 | 0.202614 | 0.227306 | 0.008715 |
| High | 0.662429 | 0.026130 | 0.038136 | 0.247881 | 0.052966 | 0.199859 | 0.243644 | 0.007062 |
| Very High | 0.594213 | 0.026817 | 0.044460 | 0.268878 | 0.035992 | 0.204658 | 0.255469 | 0.011291 |
In [203]:
# Print the data frame to check the changes
df.head()
Out[203]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | accessibility | accommodation | culture | food | nature | religion | safety | pca1 | pca2 | trust_segment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | False | False | False | False | True | False | False | 0.086252 | -0.073357 | Medium |
| 1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | False | False | False | False | False | True | False | 0.158235 | -0.086863 | Very High |
| 2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | False | False | True | False | True | True | False | 0.039676 | -0.027877 | Very High |
| 3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | False | False | True | False | False | True | False | -0.088295 | -0.026823 | Very High |
| 4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | False | False | False | False | True | True | False | 0.080956 | -0.090689 | High |
5 rows × 25 columns
In [204]:
# Generate insights
insights = {
"overall_sentiment": df["ml_sentiment_score"].mean(),
"positive_reviews": (df["review_class"] == "positive").mean(),
"top_locations": df.groupby("location")["ml_sentiment_score"]
.mean()
.sort_values(ascending=False),
"aspect_frequencies": df[
[
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
].mean(),
"trust_segment_analysis": segment_analysis,
}
# Print key insights
print(f"Overall sentiment: {insights['overall_sentiment']:.3f}")
print("\nPositive review percentage: {:.1%}".format(insights["positive_reviews"]))
Overall sentiment: 0.659 Positive review percentage: 67.2%
Aspect Based Sentiment Analysis¶
In [205]:
def investigate_negative_reviews(df):
# Filter for negative and neutral reviews
negative_reviews = df[df["review_class"] == "negative"]
# Analyze aspect mentions in negative/neutral reviews
aspect_issues = negative_reviews[
[
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
].mean()
# Analyze sentiment score distribution
sentiment_distribution = negative_reviews["ml_sentiment_score"].describe()
# Extract common complaint phrases
top_complaints = (
negative_reviews["cleaned_review"]
.str.lower()
.str.split()
.explode()
.value_counts()
.head(30)
)
return {
"aspect_issues": aspect_issues,
"sentiment_distribution": sentiment_distribution,
"top_complaints": top_complaints,
}
# Investigate negative reviews
negative_insights = investigate_negative_reviews(df)
print("\nNegative Review Insights:")
print("Aspect Issues:")
print(negative_insights["aspect_issues"])
print("\nSentiment Distribution:")
print(negative_insights["sentiment_distribution"])
print("\nTop Complaint Phrases:")
print(negative_insights["top_complaints"])
Negative Review Insights: Aspect Issues: accessibility 0.051546 accommodation 0.041237 culture 0.134021 food 0.020619 nature 0.144330 religion 0.134021 safety 0.000000 dtype: float64 Sentiment Distribution: count 97.0 mean -1.0 std 0.0 min -1.0 25% -1.0 50% -1.0 75% -1.0 max -1.0 Name: ml_sentiment_score, dtype: float64 Top Complaint Phrases: cleaned_review place 33 base 29 camp 27 everest 14 visit 10 nepal 10 see 10 annapurna 9 trek 9 temple 9 least 8 elephants 7 life 7 time 7 go 7 one 7 dirty 7 even 7 expensive 6 like 6 park 6 must 6 get 6 jungle 6 take 5 earthquake 5 walk 5 never 5 guide 5 safari 5 Name: count, dtype: int64
In [206]:
def plot_aspect_issues(aspect_issues):
plt.figure(figsize=(10, 6))
sns.barplot(
x=aspect_issues.index,
y=aspect_issues.values,
hue=aspect_issues.index,
palette="viridis",
legend=False,
)
plt.title("Aspect Issues in Negative Reviews", fontsize=16)
plt.ylabel("Average Mention Rate", fontsize=12)
plt.xlabel("Aspects", fontsize=12)
plt.show()
# Plot aspect issues in negative reviews
plot_aspect_issues(negative_insights["aspect_issues"])
In [207]:
def plot_wordcloud(top_complaints):
wordcloud = WordCloud(
width=800, height=400, background_color="white"
).generate_from_frequencies(top_complaints.to_dict())
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Top Complaint Phrases", fontsize=16)
plt.show()
# Plot word cloud of top complaint phrases
plot_wordcloud(negative_insights["top_complaints"])
In [208]:
def generate_positive_insights(df):
# Filter for positive reviews
positive_reviews = df[df["review_class"] == "positive"]
# Extract common positive phrases
positive_phrases = (
positive_reviews["cleaned_review"]
.str.lower()
.str.split()
.explode()
.value_counts()
.head(20)
)
return positive_phrases
# Generate positive review insights
positive_phrases = generate_positive_insights(df)
# Plot word cloud of positive phrases
def plot_wordcloud_positive(positive_phrases):
wordcloud = WordCloud(
width=800, height=400, background_color="white", colormap="viridis"
).generate_from_frequencies(positive_phrases.to_dict())
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Positive Phrases Word Cloud", fontsize=16)
plt.show()
plot_wordcloud_positive(positive_phrases)
In [209]:
def segment_reviews_by_type(df, tourist_types):
segmented_data = {}
for tourist_type, keywords in tourist_types.items():
# Filter reviews containing keywords for this tourist type
mask = df["cleaned_review"].str.contains(
"|".join(keywords), case=False, na=False
)
type_reviews = df[mask]
# Calculate sentiment and aspect mentions for this type
sentiment = type_reviews["ml_sentiment_score"].mean()
aspect_issues = type_reviews[
[
"accessibility",
"accommodation",
"culture",
"food",
"nature",
"religion",
"safety",
]
].mean()
# Store results
segmented_data[tourist_type] = {
"review_count": type_reviews.shape[0],
"average_sentiment": sentiment,
"aspect_issues": aspect_issues,
}
return segmented_data
# Define tourist types and associated keywords
tourist_types = {
"adventure": ["trek", "hike", "climb", "adventure"],
"cultural": ["temple", "heritage", "festival", "culture"],
"religious": ["pilgrimage", "sacred", "shrine", "god", "prayer"],
}
tourist_segment_analysis = segment_reviews_by_type(df, tourist_types)
print("\nTourist Segment Analysis:")
for type_, data in tourist_segment_analysis.items():
print(f"{type_} - Average Sentiment: {data['average_sentiment']:.3f}")
print(f"Aspect Issues:\n{data['aspect_issues']}")
Tourist Segment Analysis: adventure - Average Sentiment: 0.711 Aspect Issues: accessibility 0.084986 accommodation 0.062323 culture 0.076487 food 0.056657 nature 0.382436 religion 0.093484 safety 0.016997 dtype: float64 cultural - Average Sentiment: 0.595 Aspect Issues: accessibility 0.021879 accommodation 0.035393 culture 0.992921 food 0.041184 nature 0.157658 religion 0.726512 safety 0.005792 dtype: float64 religious - Average Sentiment: 0.439 Aspect Issues: accessibility 0.022305 accommodation 0.074349 culture 0.591078 food 0.007435 nature 0.115242 religion 0.680297 safety 0.000000 dtype: float64
In [210]:
# Prepare data for visualization
tourist_types = list(tourist_segment_analysis.keys())
review_counts = [data["review_count"] for data in tourist_segment_analysis.values()]
average_sentiments = [
data["average_sentiment"] for data in tourist_segment_analysis.values()
]
aspect_issues_df = pd.DataFrame(
[data["aspect_issues"] for data in tourist_segment_analysis.values()],
index=tourist_types,
)
# Bar chart for Average Sentiment Scores
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, average_sentiments, color=["teal", "orange", "purple"])
plt.title("Average Sentiment by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Average Sentiment", fontsize=12)
plt.ylim(0, 1) # Sentiment is likely normalized between 0 and 1
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
In [211]:
# Heatmap for Aspect Issues
plt.figure(figsize=(10, 6))
sns.heatmap(
aspect_issues_df,
annot=True,
cmap="YlGnBu",
fmt=".2f",
cbar_kws={"label": "Mean Value"},
)
plt.title("Aspect Issues Across Tourist Types", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Tourist Type", fontsize=12)
plt.xticks(rotation=45)
plt.show()
In [212]:
# Bar chart for Review Counts
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, review_counts, color=["skyblue", "salmon", "lightgreen"])
plt.title("Review Counts by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Review Count", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
In [213]:
def compare_regions(df):
# Group by location
location_stats = (
df.groupby("location")
.agg(
{
"ml_sentiment_score": ["mean", "std"],
"accessibility": "mean",
"accommodation": "mean",
"culture": "mean",
"food": "mean",
"nature": "mean",
"religion": "mean",
"safety": "mean",
}
)
.sort_values(("ml_sentiment_score", "mean"), ascending=False)
)
# Identify top complaints for each location
complaint_keywords = df.groupby("location")["cleaned_review"].apply(
lambda texts: texts.str.lower().str.split().explode().value_counts().head(5)
)
return location_stats, complaint_keywords
region_stats, location_complaints = compare_regions(df)
print("\nRegional Sentiment Comparison:")
print(region_stats)
print("\nTop Complaints by Region:")
print(location_complaints)
Regional Sentiment Comparison:
ml_sentiment_score accessibility \
mean std mean
location
Pokhara 0.830128 0.405827 0.017094
Syambhunath 0.727957 0.454819 0.046237
Chitwan National Park 0.723656 0.477682 0.018280
Langtang 0.704309 0.475841 0.031204
Bardiya National Park 0.680672 0.480824 0.018487
Bhaktapur Durbar Square 0.664516 0.490303 0.010753
Pasupatinath Temple 0.540860 0.529991 0.007527
Annapurna Base Camp 0.484848 0.682215 0.030303
Lumbini 0.473797 0.518510 0.009626
Everest Base Camp Trek 0.405405 0.692537 0.090090
accommodation culture food nature religion \
mean mean mean mean mean
location
Pokhara 0.051282 0.071581 0.141026 0.533120 0.073718
Syambhunath 0.012903 0.354839 0.019355 0.366667 0.541935
Chitwan National Park 0.064516 0.079570 0.019355 0.154839 0.016129
Langtang 0.029718 0.026746 0.016345 0.239227 0.002972
Bardiya National Park 0.045378 0.013445 0.008403 0.097479 0.006723
Bhaktapur Durbar Square 0.027957 0.411828 0.078495 0.030108 0.191398
Pasupatinath Temple 0.013978 0.523656 0.002151 0.058065 0.550538
Annapurna Base Camp 0.022727 0.000000 0.022727 0.348485 0.000000
Lumbini 0.041711 0.217112 0.003209 0.019251 0.225668
Everest Base Camp Trek 0.036036 0.000000 0.018018 0.207207 0.000000
safety
mean
location
Pokhara 0.010684
Syambhunath 0.006452
Chitwan National Park 0.013978
Langtang 0.005944
Bardiya National Park 0.011765
Bhaktapur Durbar Square 0.004301
Pasupatinath Temple 0.007527
Annapurna Base Camp 0.007576
Lumbini 0.007487
Everest Base Camp Trek 0.018018
Top Complaints by Region:
location
Annapurna Base Camp place 37
annapurna 35
amazing 25
trek 22
one 21
Bardiya National Park place 210
park 162
national 135
bardiya 77
good 71
Bhaktapur Durbar Square place 693
bhaktapur 306
square 250
visit 240
durbar 217
Chitwan National Park place 431
park 383
national 288
safari 248
one 231
Everest Base Camp Trek everest 31
base 29
camp 29
experience 22
place 20
Langtang place 216
beautiful 111
trekking 92
park 92
langtang 86
Lumbini place 718
buddha 411
temple 233
birth 232
lord 217
Pasupatinath Temple temple 624
place 477
visit 256
hindu 234
one 205
Pokhara place 641
lake 414
beautiful 257
pokhara 243
visit 177
Syambhunath place 634
kathmandu 478
temple 305
view 274
visit 240
Name: cleaned_review, dtype: int64
In [214]:
# Regional Sentiment Comparison
plt.figure(figsize=(10, 6))
sentiment_data = region_stats[("ml_sentiment_score", "mean")].sort_values()
sentiment_data.plot(kind="barh", color="skyblue", edgecolor="black")
plt.title("Mean Sentiment Scores by Region", fontsize=14)
plt.xlabel("Mean Sentiment Score", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()
In [215]:
# Aspect Metrics Heatmap
plt.figure(figsize=(12, 8))
aspect_data = region_stats.xs("mean", axis=1, level=1).iloc[
:, 1:
] # Exclude sentiment score
sns.heatmap(aspect_data, annot=True, cmap="coolwarm", cbar_kws={"label": "Mean Value"})
plt.title("Aspect Metrics by Region", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()
In [216]:
# Correlation between sentiment and aspects
correlations = region_stats.corr()["ml_sentiment_score"]["mean"]
print("Correlation between sentiment and aspects:\n", correlations)
Correlation between sentiment and aspects:
ml_sentiment_score mean 1.000000
std -0.877118
accessibility mean -0.380292
accommodation mean 0.298009
culture mean 0.015219
food mean 0.559874
nature mean 0.423403
religion mean 0.009081
safety mean -0.214823
Name: mean, dtype: float64
In [217]:
df.describe()
Out[217]:
| ID | total_review | total_review_count | ml_sentiment | ensemble_sentiment | lexicon_sentiment | final_sentiment | trust_score | sentiment_polarity | ml_sentiment_score | pca1 | pca2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7102.000000 | 7.102000e+03 | 7.102000e+03 |
| mean | 3631.451563 | 80.566742 | 80.566742 | 1.415517 | 1.469023 | 0.527836 | 1.418051 | 0.080615 | 0.384124 | 0.658828 | 4.001930e-18 | 4.802316e-17 |
| std | 2117.239644 | 119.807382 | 119.807382 | 0.881742 | 0.810323 | 0.347161 | 0.878782 | 0.121385 | 0.318223 | 0.502121 | 1.443194e-01 | 1.361941e-01 |
| min | 1.000000 | 1.000000 | 1.000000 | -2.000000 | -2.000000 | -0.851900 | -2.000000 | 0.000000 | -1.000000 | -1.000000 | -2.904202e-01 | -4.222869e-01 |
| 25% | 1809.250000 | 13.000000 | 13.000000 | 1.000000 | 1.000000 | 0.340000 | 1.000000 | 0.012158 | 0.100000 | 0.000000 | -9.846385e-02 | -4.854880e-02 |
| 50% | 3584.500000 | 38.000000 | 38.000000 | 2.000000 | 2.000000 | 0.624900 | 2.000000 | 0.037487 | 0.386970 | 1.000000 | -3.502423e-02 | -4.703180e-03 |
| 75% | 5495.750000 | 94.000000 | 94.000000 | 2.000000 | 2.000000 | 0.812600 | 2.000000 | 0.094225 | 0.600000 | 1.000000 | 6.933070e-02 | 2.404595e-02 |
| max | 7271.000000 | 988.000000 | 988.000000 | 2.000000 | 2.000000 | 0.989500 | 2.000000 | 1.000000 | 1.000000 | 1.000000 | 7.903685e-01 | 8.742405e-01 |
In [218]:
# Prepare data
correlation_values = correlations[1:] # Exclude "mean" which is always 1.0
aspects = correlation_values.index.get_level_values(0)
# Plot
plt.figure(figsize=(10, 6))
correlation_values.plot(kind="barh", color="steelblue", edgecolor="black")
plt.axvline(0, color="black", linewidth=0.8, linestyle="--")
plt.title("Correlation Between Sentiment and Aspects", fontsize=14)
plt.xlabel("Correlation Coefficient", fontsize=12)
plt.ylabel("Aspects", fontsize=12)
plt.tight_layout()
plt.show()
In [219]:
# Print the data frame to check the changes
df.head()
Out[219]:
| ID | location | total_review | review | cleaned_review | total_review_count | review_tokens | ml_sentiment | ensemble_sentiment | lexicon_sentiment | ... | accessibility | accommodation | culture | food | nature | religion | safety | pca1 | pca2 | trust_segment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Syambhunath | 46.0 | It is at the top of valleys mountain. Best pl... | top valleys mountain best place get pleasure r... | 46 | [top, valleys, mountain, best, place, get, ple... | 2 | 2 | 0.9738 | ... | False | False | False | False | True | False | False | 0.086252 | -0.073357 | Medium |
| 1 | 2 | Syambhunath | 132.0 | This place has a significant importance in Bud... | place significant importance buddhism visited ... | 132 | [place, significant, importance, buddhism, vis... | 2 | 2 | 0.9325 | ... | False | False | False | False | False | True | False | 0.158235 | -0.086863 | Very High |
| 2 | 3 | Syambhunath | 298.0 | Visited this from the other side on a rainy ev... | visited side rainy evening actually visit temp... | 298 | [visited, side, rainy, evening, actually, visi... | 2 | 2 | 0.8860 | ... | False | False | True | False | True | True | False | 0.039676 | -0.027877 | Very High |
| 3 | 4 | Syambhunath | 247.0 | A beautiful temple situated in the capital wit... | beautiful temple situated capital stunning vie... | 247 | [beautiful, temple, situated, capital, stunnin... | 2 | 2 | 0.9531 | ... | False | False | True | False | False | True | False | -0.088295 | -0.026823 | Very High |
| 4 | 5 | Syambhunath | 69.0 | great, beautiful, historic & religious place..... | great beautiful historic religious place crowd... | 69 | [great, beautiful, historic, religious, place,... | 2 | 2 | 0.9468 | ... | False | False | False | False | True | True | False | 0.080956 | -0.090689 | High |
5 rows × 25 columns