In [172]:
from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from textblob import TextBlob
import re
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
from wordcloud import WordCloud
In [173]:
# Load the data
df = pd.read_csv("nepal_tourist_reviews.csv")
In [174]:
# Basic data exploration
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing Values:\n", df.isnull().sum())
Dataset Shape: (7271, 4)

Columns: ['ID', 'location', 'total review', 'review']

Missing Values:
 ID                0
location          0
total review     38
review          169
dtype: int64

Data Cleaning & Preprocessing¶

In [175]:
# Drop missing values
df.dropna(inplace=True)
In [176]:
# Rename 'total review' to 'total_review'
df.rename(columns={"total review": "total_review"}, inplace=True)
In [177]:
# Print the data frame to check the changes
df.head()
Out[177]:
ID location total_review review
0 1 Syambhunath 46 reviews It is at the top of valleys mountain. Best pl...
1 2 Syambhunath 132 reviews This place has a significant importance in Bud...
2 3 Syambhunath 298 reviews Visited this from the other side on a rainy ev...
3 4 Syambhunath 247 reviews A beautiful temple situated in the capital wit...
4 5 Syambhunath 69 reviews great, beautiful, historic & religious place.....
In [178]:
# Check the missing values
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())
Dataset Shape: (7102, 4)

Missing Values:
 ID              0
location        0
total_review    0
review          0
dtype: int64
In [179]:
# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    # Lowercase and tokenize
    words = re.findall(r"\b\w+\b", text.lower())
    # Remove stop words
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)


# Apply the preprocessing function to the reviews
df["cleaned_review"] = df["review"].apply(preprocess_text)
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anupamabhatta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
In [180]:
# Print the data frame to check the changes
df.head()
Out[180]:
ID location total_review review cleaned_review
0 1 Syambhunath 46 reviews It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r...
1 2 Syambhunath 132 reviews This place has a significant importance in Bud... place significant importance buddhism visited ...
2 3 Syambhunath 298 reviews Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp...
3 4 Syambhunath 247 reviews A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie...
4 5 Syambhunath 69 reviews great, beautiful, historic & religious place..... great beautiful historic religious place crowd...

Exploratory Data Analysis (EDA)¶

In [181]:
# Clean the 'total_review' column - convert to string, extract numbers and handle NaN values
df["total_review_count"] = pd.to_numeric(
    df["total_review"].astype(str).str.extract(r"(\d+)")[0],
    errors="coerce",  # This will convert errors to NaN
)

# Basic statistics of review counts
print("\nReview Count Statistics:")
print(df["total_review_count"].describe())
Review Count Statistics:
count    7102.000000
mean       80.566742
std       119.807382
min         1.000000
25%        13.000000
50%        38.000000
75%        94.000000
max       988.000000
Name: total_review_count, dtype: float64
In [182]:
# Location analysis
print("\nNumber of Unique Locations:", df["location"].nunique())
print("\nTop 10 Locations by Frequency:")
print(df["location"].value_counts())
Number of Unique Locations: 10

Top 10 Locations by Frequency:
location
Pokhara                    936
Lumbini                    935
Syambhunath                930
Bhaktapur Durbar Square    930
Chitwan National Park      930
Pasupatinath Temple        930
Langtang                   673
Bardiya National Park      595
Annapurna Base Camp        132
Everest Base Camp Trek     111
Name: count, dtype: int64
In [183]:
# Let's also look at the distribution of reviews per location
location_review_stats = (
    df.groupby("location")
    .agg(
        {
            "ID": "count",  # Number of reviews per location
            "total_review_count": [
                "mean",
                "min",
                "max",
            ],  # Statistics of the 'total review' numbers
        }
    )
    .round(2)
)

print("\nLocation Statistics:")
print(location_review_stats)
Location Statistics:
                           ID total_review_count         
                        count               mean min  max
location                                                 
Annapurna Base Camp       132              95.93   1  834
Bardiya National Park     595              73.95   1  874
Bhaktapur Durbar Square   930              81.09   1  917
Chitwan National Park     930              82.14   1  897
Everest Base Camp Trek    111              90.84   1  674
Langtang                  673              77.23   1  916
Lumbini                   935              90.99   1  917
Pasupatinath Temple       930              70.68   1  936
Pokhara                   936              72.19   1  908
Syambhunath               930              89.55   1  988
In [184]:
# Print the data frame to check the changes
df.head()
Out[184]:
ID location total_review review cleaned_review total_review_count
0 1 Syambhunath 46 reviews It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46
1 2 Syambhunath 132 reviews This place has a significant importance in Bud... place significant importance buddhism visited ... 132
2 3 Syambhunath 298 reviews Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298
3 4 Syambhunath 247 reviews A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247
4 5 Syambhunath 69 reviews great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69

EDA Visualization¶

In [185]:
# Distribution of total review counts per location
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x="location",
    y="total_review_count",
    hue="location",
    palette="viridis",
    legend=False,
)
plt.title("Distribution of Total Reviews per Location", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Total Review Count", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [186]:
# Bar chart of top locations by review frequency
plt.figure(figsize=(10, 6))
sns.barplot(
    x=df["location"].value_counts().index,
    y=df["location"].value_counts().values,
    hue=df["location"].value_counts().index,
    palette="coolwarm",
    legend=False,
)
plt.title("Top Locations by Number of Reviews", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Number of Reviews", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [187]:
# Histogram of total review counts
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x="total_review_count", bins=30, kde=True, color="skyblue")
plt.title("Distribution of Total Review Counts", fontsize=14)
plt.xlabel("Total Review Count", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [188]:
# Tokenize reviews and remove stopwords
stop_words = set(stopwords.words("english"))
df["review_tokens"] = (
    df["cleaned_review"]
    .dropna()
    .apply(
        lambda x: [
            word.lower()
            for word in word_tokenize(x)
            if word.isalpha() and word.lower() not in stop_words
        ]
    )
)

# Flatten the list of tokens to count word frequencies
all_tokens = [word for tokens in df["review_tokens"].dropna() for word in tokens]
word_counts = Counter(all_tokens)

# Top 30 most common words
common_words = word_counts.most_common(30)
print("\nTop 30 Most Common Words:", common_words)

# Visualize word frequency
words, counts = zip(*common_words)
plt.figure(figsize=(10, 6))
sns.barplot(
    x=list(words), y=list(counts), hue=list(words), palette="tab10", legend=False
)
plt.title("Top 30 Most Common Words in Reviews", fontsize=14)
plt.xlabel("Word", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
Top 30 Most Common Words: [('place', 4077), ('visit', 1353), ('temple', 1284), ('nepal', 1131), ('one', 1116), ('beautiful', 1084), ('kathmandu', 812), ('best', 771), ('good', 687), ('park', 648), ('nice', 585), ('see', 582), ('great', 565), ('world', 563), ('amazing', 539), ('view', 516), ('national', 510), ('heritage', 487), ('must', 479), ('site', 462), ('buddha', 451), ('also', 446), ('lake', 432), ('time', 408), ('many', 400), ('lord', 396), ('around', 365), ('peaceful', 356), ('people', 351), ('valley', 338)]
No description has been provided for this image
In [189]:
# Print the data frame to check the changes
df.head()
Out[189]:
ID location total_review review cleaned_review total_review_count review_tokens
0 1 Syambhunath 46 reviews It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple...
1 2 Syambhunath 132 reviews This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis...
2 3 Syambhunath 298 reviews Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi...
3 4 Syambhunath 247 reviews A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin...
4 5 Syambhunath 69 reviews great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,...

ML Model Training¶

In [190]:
class TourismSentimentClassifier:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=5000, ngram_range=(1, 2), stop_words="english"
        )

        # Initialize all classifiers
        self.classifiers = {
            "naive_bayes": MultinomialNB(),
            "svm": LinearSVC(random_state=42),
            "logistic_regression": LogisticRegression(random_state=42),
            "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
            "knn": KNeighborsClassifier(n_neighbors=5),
        }

        self.trained_models = {}
        self.best_model = None
        self.best_model_name = None

    # Prepare initial labels using lexicon-based approach
    def prepare_initial_labels(self, df):
        analyzer = SentimentIntensityAnalyzer()

        def get_initial_label(text):
            scores = analyzer.polarity_scores(text)
            if scores["compound"] >= 0.5:
                return 2  # very positive
            elif scores["compound"] >= 0.1:
                return 1  # positive
            elif scores["compound"] <= -0.5:
                return -2  # very negative
            elif scores["compound"] <= -0.1:
                return -1  # negative
            return 0  # neutral

        return df["cleaned_review"].apply(get_initial_label)

    # Train and evaluate all classifiers
    def train_and_evaluate(self, df):
        # Prepare initial labels
        y = self.prepare_initial_labels(df)
        X = df["cleaned_review"]

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train and evaluate each classifier
        results = {}
        for name, classifier in self.classifiers.items():
            # Create pipeline
            pipeline = Pipeline(
                [("vectorizer", self.vectorizer), ("classifier", classifier)]
            )

            # Train
            pipeline.fit(X_train, y_train)

            # Evaluate
            score = pipeline.score(X_test, y_test)
            y_pred = pipeline.predict(X_test)
            report = classification_report(y_test, y_pred)

            # Store results
            results[name] = {"score": score, "report": report, "model": pipeline}

            self.trained_models[name] = pipeline

        # Find best model
        best_score = -1
        for name, result in results.items():
            if result["score"] > best_score:
                best_score = result["score"]
                self.best_model = result["model"]
                self.best_model_name = name

        return results

    # Make predictions using the best model
    def predict(self, text):
        if self.best_model is None:
            raise ValueError("Models haven't been trained yet!")

        return self.best_model.predict([text])[0]

    # Make predictions using ensemble of all models
    def ensemble_predict(self, text):
        if not self.trained_models:
            raise ValueError("Models haven't been trained yet!")

        predictions = []
        for model in self.trained_models.values():
            pred = model.predict([text])[0]
            predictions.append(pred)

        return max(set(predictions), key=predictions.count)
In [191]:
# Comprehensive analysis of reviews using both ML and lexicon-based approaches
def analyze_reviews(df):
    # Initialize and train ML classifiers
    classifier = TourismSentimentClassifier()
    results = classifier.train_and_evaluate(df)

    # Add ML predictions to dataframe
    df["ml_sentiment"] = df["cleaned_review"].apply(classifier.predict)
    df["ensemble_sentiment"] = df["cleaned_review"].apply(classifier.ensemble_predict)

    # Add lexicon-based predictions (from previous implementation)
    analyzer = SentimentIntensityAnalyzer()
    df["lexicon_sentiment"] = df["cleaned_review"].apply(
        lambda x: analyzer.polarity_scores(x)["compound"]
    )

    # Combine predictions
    def get_final_sentiment(row):
        ml_sent = row["ml_sentiment"]
        lex_sent = row["lexicon_sentiment"]
        ensemble_sent = row["ensemble_sentiment"]

        # Weight and combine different approaches
        if abs(ml_sent - ensemble_sent) <= 1:  # ML models agree
            return ml_sent
        else:  # Use lexicon as tiebreaker
            return (
                2
                if lex_sent >= 0.5
                else 1
                if lex_sent >= 0.1
                else -2
                if lex_sent <= -0.5
                else -1
                if lex_sent <= -0.1
                else 0
            )

    df["final_sentiment"] = df.apply(get_final_sentiment, axis=1)

    return df, results
In [192]:
# Get insights about model performance
def get_model_insights(results):
    """
    Get insights about model performance
    """
    insights = {"best_model": None, "model_comparison": {}, "feature_importance": {}}

    best_score = -1
    for name, result in results.items():
        score = result["score"]
        insights["model_comparison"][name] = {
            "accuracy": score,
            "detailed_report": result["report"],
        }

        if score > best_score:
            best_score = score
            insights["best_model"] = name

    return insights
In [193]:
# Analyze reviews using both ML and lexicon-based approaches
df, model_results = analyze_reviews(df)

# Get insights about model performance
insights = get_model_insights(model_results)

print(f"Best performing model: {insights['best_model']}")
for model, metrics in insights["model_comparison"].items():
    print(f"\n{model} accuracy: {metrics['accuracy']:.3f}")
/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_classes.py:31: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(
Best performing model: svm

naive_bayes accuracy: 0.685

svm accuracy: 0.820

logistic_regression accuracy: 0.787

random_forest accuracy: 0.807

knn accuracy: 0.331
In [194]:
# Print the data frame to check the changes
df.head()
Out[194]:
ID location total_review review cleaned_review total_review_count review_tokens ml_sentiment ensemble_sentiment lexicon_sentiment final_sentiment
0 1 Syambhunath 46 reviews It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple... 2 2 0.9738 2
1 2 Syambhunath 132 reviews This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis... 2 2 0.9325 2
2 3 Syambhunath 298 reviews Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi... 2 2 0.8860 2
3 4 Syambhunath 247 reviews A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin... 2 2 0.9531 2
4 5 Syambhunath 69 reviews great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,... 2 2 0.9468 2

ML Model Tuning & Testing¶

In [195]:
class TourismSentimentAnalyzer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words="english",
            min_df=5,  # Ignore terms that appear in less than 5 documents
        )
        # Enhanced SVM parameters
        self.classifier = LinearSVC(
            dual=False,
            random_state=42,
            class_weight="balanced",
            C=1.0,  # Regularization parameter
            tol=1e-4,  # Tolerance for stopping criterion
        )
        self.scaler = MinMaxScaler()

    def fit(self, X, y):
        """Train the model with given data"""
        X_transformed = self.vectorizer.fit_transform(X)
        self.classifier.fit(X_transformed, y)
        return self

    def predict(self, X):
        """Predict sentiment for new data"""
        X_transformed = self.vectorizer.transform(X)
        return self.classifier.predict(X_transformed)


def improved_preprocess_data(df):
    """
    Enhanced preprocessing pipeline with improved SVM-based sentiment analysis
    """
    # Handle warnings
    warnings.filterwarnings("ignore", category=UserWarning)

    # Extract numeric part from 'total_review' and convert to float
    df["total_review"] = df["total_review"].str.extract(r"(\d+)").astype(float)

    # Calculate trust scores
    scaler = MinMaxScaler()
    df["trust_score"] = scaler.fit_transform(df[["total_review"]])

    # Calculate sentiment polarity using TextBlob
    df["sentiment_polarity"] = df["cleaned_review"].apply(
        lambda x: TextBlob(str(x)).sentiment.polarity
    )

    # Initialize and train the sentiment analyzer
    analyzer = TourismSentimentAnalyzer()

    # Create initial labels for training using TextBlob
    initial_sentiments = df["cleaned_review"].apply(
        lambda x: 1
        if TextBlob(str(x)).sentiment.polarity > 0.2
        else -1
        if TextBlob(str(x)).sentiment.polarity < -0.2
        else 0
    )

    # Train the SVM model
    analyzer.fit(df["cleaned_review"], initial_sentiments)

    # Apply ML-based sentiment analysis
    df["ml_sentiment_score"] = analyzer.predict(df["cleaned_review"])

    # Convert ML scores to final classification (positive/negative/neutral)
    df["review_class"] = df["ml_sentiment_score"].map(
        {1: "positive", 0: "neutral", -1: "negative"}
    )

    return df


# Function to apply the sentiment analysis to your main DataFrame
def apply_sentiment_analysis(df):
    """
    Apply the improved sentiment analysis to the main DataFrame
    """
    # Make sure the DataFrame has the required columns
    if "cleaned_review" not in df.columns:
        raise ValueError("DataFrame must contain 'cleaned_review' column")

    # Apply the improved preprocessing
    processed_df = improved_preprocess_data(df)

    return processed_df
In [196]:
# Apply the sentiment analysis
df = apply_sentiment_analysis(df)

# Now df contains:
# - trust_score: normalized score based on total_review
# - ml_sentiment_score: raw SVM predictions
# - review_class: final classification as "positive", "negative", or "neutral"

# Display the distribution of sentiment classes
print("\nSentiment Distribution:")
print(df["review_class"].value_counts())
Sentiment Distribution:
review_class
positive    4776
neutral     2229
negative      97
Name: count, dtype: int64
In [197]:
# Print the data frame to check the changes
df.head()
Out[197]:
ID location total_review review cleaned_review total_review_count review_tokens ml_sentiment ensemble_sentiment lexicon_sentiment final_sentiment trust_score sentiment_polarity ml_sentiment_score review_class
0 1 Syambhunath 46.0 It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple... 2 2 0.9738 2 0.045593 0.700000 1 positive
1 2 Syambhunath 132.0 This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis... 2 2 0.9325 2 0.132725 0.329167 1 positive
2 3 Syambhunath 298.0 Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi... 2 2 0.8860 2 0.300912 0.364583 1 positive
3 4 Syambhunath 247.0 A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin... 2 2 0.9531 2 0.249240 0.687500 1 positive
4 5 Syambhunath 69.0 great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,... 2 2 0.9468 2 0.068896 0.500000 1 positive

Pattern Mining¶

In [198]:
def extract_patterns(df):
    # Create aspects dictionary
    aspects = {
        "accessibility": [
            "accessibility",
            "transport",
            "transportation",
            "road",
            "roads",
            "access",
            "reach",
            "distance",
        ],
        "accommodation": [
            "hotel",
            "bed",
            "room",
            "stay",
            "house",
            "accommodation",
            "garden",
        ],
        "culture": [
            "temple",
            "culture",
            "tradition",
            "heritage",
            "Buddha",
            "Buddhism",
            "Buddhist",
            "Hindu",
            "Hinduism",
        ],
        "food": ["food", "restaurant", "breakfast", "dinner", "delicious", "cuisine"],
        "nature": [
            "mountain",
            "mountains",
            "view",
            "lake",
            "scenery",
            "landscape",
            "nature",
            "heaven",
            "Himalayas",
            "Everest",
            "Sagarmatha",
        ],
        "religion": [
            "temple",
            "monastery",
            "stupa",
            "Stupa",
            "Spirituality",
            "pagoda",
            "shrine",
            "religion",
            "religious",
            "monk",
            "nun",
            "priest",
        ],
        "safety": [
            "safe",
            "security",
            "dangerous",
            "risk",
            "risky",
            "safety",
            "secure",
            "alert",
            "crime",
            "scam",
            "fraud",
        ],
    }

    # Extract aspects from reviews (similar to association rule mining)
    for aspect, keywords in aspects.items():
        df[aspect] = (
            df["cleaned_review"]
            .fillna("")
            .apply(lambda x: any(word in str(x).lower() for word in keywords))
        )

    return df


# Extract patterns from reviews
df = extract_patterns(df)
df
Out[198]:
ID location total_review review cleaned_review total_review_count review_tokens ml_sentiment ensemble_sentiment lexicon_sentiment ... sentiment_polarity ml_sentiment_score review_class accessibility accommodation culture food nature religion safety
0 1 Syambhunath 46.0 It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple... 2 2 0.9738 ... 0.700000 1 positive False False False False True False False
1 2 Syambhunath 132.0 This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis... 2 2 0.9325 ... 0.329167 1 positive False False False False False True False
2 3 Syambhunath 298.0 Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi... 2 2 0.8860 ... 0.364583 1 positive False False True False True True False
3 4 Syambhunath 247.0 A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin... 2 2 0.9531 ... 0.687500 1 positive False False True False False True False
4 5 Syambhunath 69.0 great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,... 2 2 0.9468 ... 0.500000 1 positive False False False False True True False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7266 7267 Pokhara 9.0 It's a nice place to sit back, and enjoy. The ... nice place sit back enjoy calm fresh air lake ... 9 [nice, place, sit, back, enjoy, calm, fresh, a... 2 2 0.9456 ... 0.312500 1 positive False False False False True False False
7267 7268 Pokhara 3.0 Excellent Place to visit, Lifetime memories excellent place visit lifetime memories 3 [excellent, place, visit, lifetime, memories] 2 2 0.5719 ... 1.000000 1 positive False False False False False False False
7268 7269 Pokhara 79.0 It's very photogenic and relaxing when there a... photogenic relaxing many people 79 [photogenic, relaxing, many, people] 1 1 0.4939 ... 0.500000 0 neutral False False False False False False False
7269 7270 Pokhara 14.0 U can get real definition of nature's beauty a... u get real definition nature beauty peace 14 [u, get, real, definition, nature, beauty, peace] 2 2 0.8074 ... 0.200000 0 neutral False False False False True False False
7270 7271 Pokhara 58.0 Best. Walking please beautiful views best walking please beautiful views 58 [best, walking, please, beautiful, views] 2 2 0.8860 ... 0.925000 1 positive False False False False True False False

7102 rows × 22 columns

Similarity-based Analysis¶

In [199]:
def calculate_similarities(df):
    # TF-IDF Vectorization
    tfidf = TfidfVectorizer(max_features=1000, stop_words="english")
    text_matrix = tfidf.fit_transform(df["cleaned_review"].fillna(""))

    # PCA for dimension reduction
    pca = PCA(n_components=2)
    text_pca = pca.fit_transform(text_matrix.toarray())
    df["pca1"] = text_pca[:, 0]
    df["pca2"] = text_pca[:, 1]

    return df, tfidf.get_feature_names_out()


# Calculate similarities using TF-IDF and PCA
df, feature_names = calculate_similarities(df)
df
Out[199]:
ID location total_review review cleaned_review total_review_count review_tokens ml_sentiment ensemble_sentiment lexicon_sentiment ... review_class accessibility accommodation culture food nature religion safety pca1 pca2
0 1 Syambhunath 46.0 It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple... 2 2 0.9738 ... positive False False False False True False False 0.086252 -0.073357
1 2 Syambhunath 132.0 This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis... 2 2 0.9325 ... positive False False False False False True False 0.158235 -0.086863
2 3 Syambhunath 298.0 Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi... 2 2 0.8860 ... positive False False True False True True False 0.039676 -0.027877
3 4 Syambhunath 247.0 A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin... 2 2 0.9531 ... positive False False True False False True False -0.088295 -0.026823
4 5 Syambhunath 69.0 great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,... 2 2 0.9468 ... positive False False False False True True False 0.080956 -0.090689
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7266 7267 Pokhara 9.0 It's a nice place to sit back, and enjoy. The ... nice place sit back enjoy calm fresh air lake ... 9 [nice, place, sit, back, enjoy, calm, fresh, a... 2 2 0.9456 ... positive False False False False True False False 0.035502 0.141770
7267 7268 Pokhara 3.0 Excellent Place to visit, Lifetime memories excellent place visit lifetime memories 3 [excellent, place, visit, lifetime, memories] 2 2 0.5719 ... positive False False False False False False False 0.087738 -0.041908
7268 7269 Pokhara 79.0 It's very photogenic and relaxing when there a... photogenic relaxing many people 79 [photogenic, relaxing, many, people] 1 1 0.4939 ... neutral False False False False False False False -0.090147 0.020505
7269 7270 Pokhara 14.0 U can get real definition of nature's beauty a... u get real definition nature beauty peace 14 [u, get, real, definition, nature, beauty, peace] 2 2 0.8074 ... neutral False False False False True False False -0.087003 0.007280
7270 7271 Pokhara 58.0 Best. Walking please beautiful views best walking please beautiful views 58 [best, walking, please, beautiful, views] 2 2 0.8860 ... positive False False False False True False False 0.018753 -0.067533

7102 rows × 24 columns

Network Analysis¶

In [200]:
def create_aspect_network(df):
    G = nx.Graph()
    aspects = [
        "accessibility",
        "accommodation",
        "culture",
        "food",
        "nature",
        "religion",
        "safety",
    ]

    # Create edges between aspects that commonly co-occur
    for i in range(len(aspects)):
        for j in range(i + 1, len(aspects)):
            weight = df[df[aspects[i]] & df[aspects[j]]].shape[0]
            if weight > 0:
                G.add_edge(aspects[i], aspects[j], weight=weight)

    return G


# Create aspect network
G = create_aspect_network(df)
G.edges(data=True)

# Display EdgeDataView in a table format
print("\nAspect Network:")
aspect = pd.DataFrame(G.edges(data=True), columns=["Aspect1", "Aspect2", "Weight"])
aspect
Aspect Network:
Out[200]:
Aspect1 Aspect2 Weight
0 accessibility accommodation {'weight': 9}
1 accessibility culture {'weight': 34}
2 accessibility food {'weight': 8}
3 accessibility nature {'weight': 46}
4 accessibility religion {'weight': 42}
5 accessibility safety {'weight': 2}
6 accommodation culture {'weight': 54}
7 accommodation food {'weight': 48}
8 accommodation nature {'weight': 66}
9 accommodation religion {'weight': 44}
10 accommodation safety {'weight': 3}
11 culture food {'weight': 63}
12 culture nature {'weight': 246}
13 culture religion {'weight': 1129}
14 culture safety {'weight': 9}
15 food nature {'weight': 112}
16 food religion {'weight': 44}
17 food safety {'weight': 4}
18 nature religion {'weight': 295}
19 nature safety {'weight': 14}
20 religion safety {'weight': 12}
In [201]:
# Draw the aspect network
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=1000, node_color="skyblue")
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, edge_color="gray")
nx.draw_networkx_labels(G, pos, font_size=12, font_color="black")
plt.title("Aspect Network", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.show()
No description has been provided for this image

Segmentation Analysis¶

In [202]:
def analyze_by_trust_segments(df):
    df["trust_segment"] = pd.qcut(
        df["trust_score"],
        q=5,
        labels=["Very Low", "Low", "Medium", "High", "Very High"],
    )
    segment_analysis = df.groupby("trust_segment", observed=False).agg(
        {
            "ml_sentiment_score": "mean",
            "accessibility": "mean",
            "accommodation": "mean",
            "culture": "mean",
            "food": "mean",
            "nature": "mean",
            "religion": "mean",
            "safety": "mean",
        }
    )
    return segment_analysis


# Analyze by trust score segments
segment_analysis = analyze_by_trust_segments(df)
segment_analysis
Out[202]:
ml_sentiment_score accessibility accommodation culture food nature religion safety
trust_segment
Very Low 0.675746 0.014916 0.033074 0.151751 0.027886 0.167315 0.136187 0.007782
Low 0.671111 0.018519 0.029630 0.215556 0.031111 0.194074 0.196296 0.008148
Medium 0.690632 0.018155 0.031954 0.227306 0.040668 0.202614 0.227306 0.008715
High 0.662429 0.026130 0.038136 0.247881 0.052966 0.199859 0.243644 0.007062
Very High 0.594213 0.026817 0.044460 0.268878 0.035992 0.204658 0.255469 0.011291
In [203]:
# Print the data frame to check the changes
df.head()
Out[203]:
ID location total_review review cleaned_review total_review_count review_tokens ml_sentiment ensemble_sentiment lexicon_sentiment ... accessibility accommodation culture food nature religion safety pca1 pca2 trust_segment
0 1 Syambhunath 46.0 It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple... 2 2 0.9738 ... False False False False True False False 0.086252 -0.073357 Medium
1 2 Syambhunath 132.0 This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis... 2 2 0.9325 ... False False False False False True False 0.158235 -0.086863 Very High
2 3 Syambhunath 298.0 Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi... 2 2 0.8860 ... False False True False True True False 0.039676 -0.027877 Very High
3 4 Syambhunath 247.0 A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin... 2 2 0.9531 ... False False True False False True False -0.088295 -0.026823 Very High
4 5 Syambhunath 69.0 great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,... 2 2 0.9468 ... False False False False True True False 0.080956 -0.090689 High

5 rows × 25 columns

In [204]:
# Generate insights
insights = {
    "overall_sentiment": df["ml_sentiment_score"].mean(),
    "positive_reviews": (df["review_class"] == "positive").mean(),
    "top_locations": df.groupby("location")["ml_sentiment_score"]
    .mean()
    .sort_values(ascending=False),
    "aspect_frequencies": df[
        [
            "accessibility",
            "accommodation",
            "culture",
            "food",
            "nature",
            "religion",
            "safety",
        ]
    ].mean(),
    "trust_segment_analysis": segment_analysis,
}

# Print key insights
print(f"Overall sentiment: {insights['overall_sentiment']:.3f}")
print("\nPositive review percentage: {:.1%}".format(insights["positive_reviews"]))
Overall sentiment: 0.659

Positive review percentage: 67.2%

Aspect Based Sentiment Analysis¶

In [205]:
def investigate_negative_reviews(df):
    # Filter for negative and neutral reviews
    negative_reviews = df[df["review_class"] == "negative"]

    # Analyze aspect mentions in negative/neutral reviews
    aspect_issues = negative_reviews[
        [
            "accessibility",
            "accommodation",
            "culture",
            "food",
            "nature",
            "religion",
            "safety",
        ]
    ].mean()

    # Analyze sentiment score distribution
    sentiment_distribution = negative_reviews["ml_sentiment_score"].describe()

    # Extract common complaint phrases
    top_complaints = (
        negative_reviews["cleaned_review"]
        .str.lower()
        .str.split()
        .explode()
        .value_counts()
        .head(30)
    )

    return {
        "aspect_issues": aspect_issues,
        "sentiment_distribution": sentiment_distribution,
        "top_complaints": top_complaints,
    }


# Investigate negative reviews
negative_insights = investigate_negative_reviews(df)
print("\nNegative Review Insights:")
print("Aspect Issues:")
print(negative_insights["aspect_issues"])
print("\nSentiment Distribution:")
print(negative_insights["sentiment_distribution"])
print("\nTop Complaint Phrases:")
print(negative_insights["top_complaints"])
Negative Review Insights:
Aspect Issues:
accessibility    0.051546
accommodation    0.041237
culture          0.134021
food             0.020619
nature           0.144330
religion         0.134021
safety           0.000000
dtype: float64

Sentiment Distribution:
count    97.0
mean     -1.0
std       0.0
min      -1.0
25%      -1.0
50%      -1.0
75%      -1.0
max      -1.0
Name: ml_sentiment_score, dtype: float64

Top Complaint Phrases:
cleaned_review
place         33
base          29
camp          27
everest       14
visit         10
nepal         10
see           10
annapurna      9
trek           9
temple         9
least          8
elephants      7
life           7
time           7
go             7
one            7
dirty          7
even           7
expensive      6
like           6
park           6
must           6
get            6
jungle         6
take           5
earthquake     5
walk           5
never          5
guide          5
safari         5
Name: count, dtype: int64
In [206]:
def plot_aspect_issues(aspect_issues):
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=aspect_issues.index,
        y=aspect_issues.values,
        hue=aspect_issues.index,
        palette="viridis",
        legend=False,
    )
    plt.title("Aspect Issues in Negative Reviews", fontsize=16)
    plt.ylabel("Average Mention Rate", fontsize=12)
    plt.xlabel("Aspects", fontsize=12)
    plt.show()


# Plot aspect issues in negative reviews
plot_aspect_issues(negative_insights["aspect_issues"])
No description has been provided for this image
In [207]:
def plot_wordcloud(top_complaints):
    wordcloud = WordCloud(
        width=800, height=400, background_color="white"
    ).generate_from_frequencies(top_complaints.to_dict())
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Top Complaint Phrases", fontsize=16)
    plt.show()


# Plot word cloud of top complaint phrases
plot_wordcloud(negative_insights["top_complaints"])
No description has been provided for this image
In [208]:
def generate_positive_insights(df):
    # Filter for positive reviews
    positive_reviews = df[df["review_class"] == "positive"]

    # Extract common positive phrases
    positive_phrases = (
        positive_reviews["cleaned_review"]
        .str.lower()
        .str.split()
        .explode()
        .value_counts()
        .head(20)
    )

    return positive_phrases


# Generate positive review insights
positive_phrases = generate_positive_insights(df)


# Plot word cloud of positive phrases
def plot_wordcloud_positive(positive_phrases):
    wordcloud = WordCloud(
        width=800, height=400, background_color="white", colormap="viridis"
    ).generate_from_frequencies(positive_phrases.to_dict())
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Positive Phrases Word Cloud", fontsize=16)
    plt.show()


plot_wordcloud_positive(positive_phrases)
No description has been provided for this image
In [209]:
def segment_reviews_by_type(df, tourist_types):
    segmented_data = {}

    for tourist_type, keywords in tourist_types.items():
        # Filter reviews containing keywords for this tourist type
        mask = df["cleaned_review"].str.contains(
            "|".join(keywords), case=False, na=False
        )
        type_reviews = df[mask]

        # Calculate sentiment and aspect mentions for this type
        sentiment = type_reviews["ml_sentiment_score"].mean()
        aspect_issues = type_reviews[
            [
                "accessibility",
                "accommodation",
                "culture",
                "food",
                "nature",
                "religion",
                "safety",
            ]
        ].mean()

        # Store results
        segmented_data[tourist_type] = {
            "review_count": type_reviews.shape[0],
            "average_sentiment": sentiment,
            "aspect_issues": aspect_issues,
        }

    return segmented_data


# Define tourist types and associated keywords
tourist_types = {
    "adventure": ["trek", "hike", "climb", "adventure"],
    "cultural": ["temple", "heritage", "festival", "culture"],
    "religious": ["pilgrimage", "sacred", "shrine", "god", "prayer"],
}
tourist_segment_analysis = segment_reviews_by_type(df, tourist_types)
print("\nTourist Segment Analysis:")
for type_, data in tourist_segment_analysis.items():
    print(f"{type_} - Average Sentiment: {data['average_sentiment']:.3f}")
    print(f"Aspect Issues:\n{data['aspect_issues']}")
Tourist Segment Analysis:
adventure - Average Sentiment: 0.711
Aspect Issues:
accessibility    0.084986
accommodation    0.062323
culture          0.076487
food             0.056657
nature           0.382436
religion         0.093484
safety           0.016997
dtype: float64
cultural - Average Sentiment: 0.595
Aspect Issues:
accessibility    0.021879
accommodation    0.035393
culture          0.992921
food             0.041184
nature           0.157658
religion         0.726512
safety           0.005792
dtype: float64
religious - Average Sentiment: 0.439
Aspect Issues:
accessibility    0.022305
accommodation    0.074349
culture          0.591078
food             0.007435
nature           0.115242
religion         0.680297
safety           0.000000
dtype: float64
In [210]:
# Prepare data for visualization
tourist_types = list(tourist_segment_analysis.keys())
review_counts = [data["review_count"] for data in tourist_segment_analysis.values()]
average_sentiments = [
    data["average_sentiment"] for data in tourist_segment_analysis.values()
]
aspect_issues_df = pd.DataFrame(
    [data["aspect_issues"] for data in tourist_segment_analysis.values()],
    index=tourist_types,
)

# Bar chart for Average Sentiment Scores
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, average_sentiments, color=["teal", "orange", "purple"])
plt.title("Average Sentiment by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Average Sentiment", fontsize=12)
plt.ylim(0, 1)  # Sentiment is likely normalized between 0 and 1
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
No description has been provided for this image
In [211]:
# Heatmap for Aspect Issues
plt.figure(figsize=(10, 6))
sns.heatmap(
    aspect_issues_df,
    annot=True,
    cmap="YlGnBu",
    fmt=".2f",
    cbar_kws={"label": "Mean Value"},
)
plt.title("Aspect Issues Across Tourist Types", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Tourist Type", fontsize=12)
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [212]:
# Bar chart for Review Counts
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, review_counts, color=["skyblue", "salmon", "lightgreen"])
plt.title("Review Counts by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Review Count", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()
No description has been provided for this image
In [213]:
def compare_regions(df):
    # Group by location
    location_stats = (
        df.groupby("location")
        .agg(
            {
                "ml_sentiment_score": ["mean", "std"],
                "accessibility": "mean",
                "accommodation": "mean",
                "culture": "mean",
                "food": "mean",
                "nature": "mean",
                "religion": "mean",
                "safety": "mean",
            }
        )
        .sort_values(("ml_sentiment_score", "mean"), ascending=False)
    )

    # Identify top complaints for each location
    complaint_keywords = df.groupby("location")["cleaned_review"].apply(
        lambda texts: texts.str.lower().str.split().explode().value_counts().head(5)
    )

    return location_stats, complaint_keywords


region_stats, location_complaints = compare_regions(df)
print("\nRegional Sentiment Comparison:")
print(region_stats)
print("\nTop Complaints by Region:")
print(location_complaints)
Regional Sentiment Comparison:
                        ml_sentiment_score           accessibility  \
                                      mean       std          mean   
location                                                             
Pokhara                           0.830128  0.405827      0.017094   
Syambhunath                       0.727957  0.454819      0.046237   
Chitwan National Park             0.723656  0.477682      0.018280   
Langtang                          0.704309  0.475841      0.031204   
Bardiya National Park             0.680672  0.480824      0.018487   
Bhaktapur Durbar Square           0.664516  0.490303      0.010753   
Pasupatinath Temple               0.540860  0.529991      0.007527   
Annapurna Base Camp               0.484848  0.682215      0.030303   
Lumbini                           0.473797  0.518510      0.009626   
Everest Base Camp Trek            0.405405  0.692537      0.090090   

                        accommodation   culture      food    nature  religion  \
                                 mean      mean      mean      mean      mean   
location                                                                        
Pokhara                      0.051282  0.071581  0.141026  0.533120  0.073718   
Syambhunath                  0.012903  0.354839  0.019355  0.366667  0.541935   
Chitwan National Park        0.064516  0.079570  0.019355  0.154839  0.016129   
Langtang                     0.029718  0.026746  0.016345  0.239227  0.002972   
Bardiya National Park        0.045378  0.013445  0.008403  0.097479  0.006723   
Bhaktapur Durbar Square      0.027957  0.411828  0.078495  0.030108  0.191398   
Pasupatinath Temple          0.013978  0.523656  0.002151  0.058065  0.550538   
Annapurna Base Camp          0.022727  0.000000  0.022727  0.348485  0.000000   
Lumbini                      0.041711  0.217112  0.003209  0.019251  0.225668   
Everest Base Camp Trek       0.036036  0.000000  0.018018  0.207207  0.000000   

                           safety  
                             mean  
location                           
Pokhara                  0.010684  
Syambhunath              0.006452  
Chitwan National Park    0.013978  
Langtang                 0.005944  
Bardiya National Park    0.011765  
Bhaktapur Durbar Square  0.004301  
Pasupatinath Temple      0.007527  
Annapurna Base Camp      0.007576  
Lumbini                  0.007487  
Everest Base Camp Trek   0.018018  

Top Complaints by Region:
location                           
Annapurna Base Camp      place          37
                         annapurna      35
                         amazing        25
                         trek           22
                         one            21
Bardiya National Park    place         210
                         park          162
                         national      135
                         bardiya        77
                         good           71
Bhaktapur Durbar Square  place         693
                         bhaktapur     306
                         square        250
                         visit         240
                         durbar        217
Chitwan National Park    place         431
                         park          383
                         national      288
                         safari        248
                         one           231
Everest Base Camp Trek   everest        31
                         base           29
                         camp           29
                         experience     22
                         place          20
Langtang                 place         216
                         beautiful     111
                         trekking       92
                         park           92
                         langtang       86
Lumbini                  place         718
                         buddha        411
                         temple        233
                         birth         232
                         lord          217
Pasupatinath Temple      temple        624
                         place         477
                         visit         256
                         hindu         234
                         one           205
Pokhara                  place         641
                         lake          414
                         beautiful     257
                         pokhara       243
                         visit         177
Syambhunath              place         634
                         kathmandu     478
                         temple        305
                         view          274
                         visit         240
Name: cleaned_review, dtype: int64
In [214]:
# Regional Sentiment Comparison
plt.figure(figsize=(10, 6))
sentiment_data = region_stats[("ml_sentiment_score", "mean")].sort_values()
sentiment_data.plot(kind="barh", color="skyblue", edgecolor="black")
plt.title("Mean Sentiment Scores by Region", fontsize=14)
plt.xlabel("Mean Sentiment Score", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [215]:
# Aspect Metrics Heatmap
plt.figure(figsize=(12, 8))
aspect_data = region_stats.xs("mean", axis=1, level=1).iloc[
    :, 1:
]  # Exclude sentiment score
sns.heatmap(aspect_data, annot=True, cmap="coolwarm", cbar_kws={"label": "Mean Value"})
plt.title("Aspect Metrics by Region", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [216]:
# Correlation between sentiment and aspects
correlations = region_stats.corr()["ml_sentiment_score"]["mean"]
print("Correlation between sentiment and aspects:\n", correlations)
Correlation between sentiment and aspects:
 ml_sentiment_score  mean    1.000000
                    std    -0.877118
accessibility       mean   -0.380292
accommodation       mean    0.298009
culture             mean    0.015219
food                mean    0.559874
nature              mean    0.423403
religion            mean    0.009081
safety              mean   -0.214823
Name: mean, dtype: float64
In [217]:
df.describe()
Out[217]:
ID total_review total_review_count ml_sentiment ensemble_sentiment lexicon_sentiment final_sentiment trust_score sentiment_polarity ml_sentiment_score pca1 pca2
count 7102.000000 7102.000000 7102.000000 7102.000000 7102.000000 7102.000000 7102.000000 7102.000000 7102.000000 7102.000000 7.102000e+03 7.102000e+03
mean 3631.451563 80.566742 80.566742 1.415517 1.469023 0.527836 1.418051 0.080615 0.384124 0.658828 4.001930e-18 4.802316e-17
std 2117.239644 119.807382 119.807382 0.881742 0.810323 0.347161 0.878782 0.121385 0.318223 0.502121 1.443194e-01 1.361941e-01
min 1.000000 1.000000 1.000000 -2.000000 -2.000000 -0.851900 -2.000000 0.000000 -1.000000 -1.000000 -2.904202e-01 -4.222869e-01
25% 1809.250000 13.000000 13.000000 1.000000 1.000000 0.340000 1.000000 0.012158 0.100000 0.000000 -9.846385e-02 -4.854880e-02
50% 3584.500000 38.000000 38.000000 2.000000 2.000000 0.624900 2.000000 0.037487 0.386970 1.000000 -3.502423e-02 -4.703180e-03
75% 5495.750000 94.000000 94.000000 2.000000 2.000000 0.812600 2.000000 0.094225 0.600000 1.000000 6.933070e-02 2.404595e-02
max 7271.000000 988.000000 988.000000 2.000000 2.000000 0.989500 2.000000 1.000000 1.000000 1.000000 7.903685e-01 8.742405e-01
In [218]:
# Prepare data
correlation_values = correlations[1:]  # Exclude "mean" which is always 1.0
aspects = correlation_values.index.get_level_values(0)

# Plot
plt.figure(figsize=(10, 6))
correlation_values.plot(kind="barh", color="steelblue", edgecolor="black")
plt.axvline(0, color="black", linewidth=0.8, linestyle="--")
plt.title("Correlation Between Sentiment and Aspects", fontsize=14)
plt.xlabel("Correlation Coefficient", fontsize=12)
plt.ylabel("Aspects", fontsize=12)
plt.tight_layout()
plt.show()
No description has been provided for this image
In [219]:
# Print the data frame to check the changes
df.head()
Out[219]:
ID location total_review review cleaned_review total_review_count review_tokens ml_sentiment ensemble_sentiment lexicon_sentiment ... accessibility accommodation culture food nature religion safety pca1 pca2 trust_segment
0 1 Syambhunath 46.0 It is at the top of valleys mountain. Best pl... top valleys mountain best place get pleasure r... 46 [top, valleys, mountain, best, place, get, ple... 2 2 0.9738 ... False False False False True False False 0.086252 -0.073357 Medium
1 2 Syambhunath 132.0 This place has a significant importance in Bud... place significant importance buddhism visited ... 132 [place, significant, importance, buddhism, vis... 2 2 0.9325 ... False False False False False True False 0.158235 -0.086863 Very High
2 3 Syambhunath 298.0 Visited this from the other side on a rainy ev... visited side rainy evening actually visit temp... 298 [visited, side, rainy, evening, actually, visi... 2 2 0.8860 ... False False True False True True False 0.039676 -0.027877 Very High
3 4 Syambhunath 247.0 A beautiful temple situated in the capital wit... beautiful temple situated capital stunning vie... 247 [beautiful, temple, situated, capital, stunnin... 2 2 0.9531 ... False False True False False True False -0.088295 -0.026823 Very High
4 5 Syambhunath 69.0 great, beautiful, historic & religious place..... great beautiful historic religious place crowd... 69 [great, beautiful, historic, religious, place,... 2 2 0.9468 ... False False False False True True False 0.080956 -0.090689 High

5 rows × 25 columns

FIN¶