from collections import Counter
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVC
from textblob import TextBlob
import re
import seaborn as sns
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
from wordcloud import WordCloud

# Load the data
df = pd.read_csv("nepal_tourist_reviews.csv")

# Basic data exploration
print("Dataset Shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nMissing Values:\n", df.isnull().sum())

Dataset Shape: (7271, 4)

Columns: ['ID', 'location', 'total review', 'review']

Missing Values:
 ID                0
location          0
total review     38
review          169
dtype: int64

# Drop missing values
df.dropna(inplace=True)

# Rename 'total review' to 'total_review'
df.rename(columns={"total review": "total_review"}, inplace=True)

# Print the data frame to check the changes
df.head()

# Check the missing values
print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", df.isnull().sum())

Dataset Shape: (7102, 4)

Missing Values:
 ID              0
location        0
total_review    0
review          0
dtype: int64

# Download stopwords if not already present
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    # Lowercase and tokenize
    words = re.findall(r"\b\w+\b", text.lower())
    # Remove stop words
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)


# Apply the preprocessing function to the reviews
df["cleaned_review"] = df["review"].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anupamabhatta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

# Print the data frame to check the changes
df.head()

# Clean the 'total_review' column - convert to string, extract numbers and handle NaN values
df["total_review_count"] = pd.to_numeric(
    df["total_review"].astype(str).str.extract(r"(\d+)")[0],
    errors="coerce",  # This will convert errors to NaN
)

# Basic statistics of review counts
print("\nReview Count Statistics:")
print(df["total_review_count"].describe())

Review Count Statistics:
count    7102.000000
mean       80.566742
std       119.807382
min         1.000000
25%        13.000000
50%        38.000000
75%        94.000000
max       988.000000
Name: total_review_count, dtype: float64

# Location analysis
print("\nNumber of Unique Locations:", df["location"].nunique())
print("\nTop 10 Locations by Frequency:")
print(df["location"].value_counts())

Number of Unique Locations: 10

Top 10 Locations by Frequency:
location
Pokhara                    936
Lumbini                    935
Syambhunath                930
Bhaktapur Durbar Square    930
Chitwan National Park      930
Pasupatinath Temple        930
Langtang                   673
Bardiya National Park      595
Annapurna Base Camp        132
Everest Base Camp Trek     111
Name: count, dtype: int64

# Let's also look at the distribution of reviews per location
location_review_stats = (
    df.groupby("location")
    .agg(
        {
            "ID": "count",  # Number of reviews per location
            "total_review_count": [
                "mean",
                "min",
                "max",
            ],  # Statistics of the 'total review' numbers
        }
    )
    .round(2)
)

print("\nLocation Statistics:")
print(location_review_stats)

Location Statistics:
                           ID total_review_count         
                        count               mean min  max
location                                                 
Annapurna Base Camp       132              95.93   1  834
Bardiya National Park     595              73.95   1  874
Bhaktapur Durbar Square   930              81.09   1  917
Chitwan National Park     930              82.14   1  897
Everest Base Camp Trek    111              90.84   1  674
Langtang                  673              77.23   1  916
Lumbini                   935              90.99   1  917
Pasupatinath Temple       930              70.68   1  936
Pokhara                   936              72.19   1  908
Syambhunath               930              89.55   1  988

# Print the data frame to check the changes
df.head()

# Distribution of total review counts per location
plt.figure(figsize=(10, 6))
sns.boxplot(
    data=df,
    x="location",
    y="total_review_count",
    hue="location",
    palette="viridis",
    legend=False,
)
plt.title("Distribution of Total Reviews per Location", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Total Review Count", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Bar chart of top locations by review frequency
plt.figure(figsize=(10, 6))
sns.barplot(
    x=df["location"].value_counts().index,
    y=df["location"].value_counts().values,
    hue=df["location"].value_counts().index,
    palette="coolwarm",
    legend=False,
)
plt.title("Top Locations by Number of Reviews", fontsize=14)
plt.xlabel("Location", fontsize=12)
plt.ylabel("Number of Reviews", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Histogram of total review counts
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x="total_review_count", bins=30, kde=True, color="skyblue")
plt.title("Distribution of Total Review Counts", fontsize=14)
plt.xlabel("Total Review Count", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.tight_layout()
plt.show()

# Tokenize reviews and remove stopwords
stop_words = set(stopwords.words("english"))
df["review_tokens"] = (
    df["cleaned_review"]
    .dropna()
    .apply(
        lambda x: [
            word.lower()
            for word in word_tokenize(x)
            if word.isalpha() and word.lower() not in stop_words
        ]
    )
)

# Flatten the list of tokens to count word frequencies
all_tokens = [word for tokens in df["review_tokens"].dropna() for word in tokens]
word_counts = Counter(all_tokens)

# Top 30 most common words
common_words = word_counts.most_common(30)
print("\nTop 30 Most Common Words:", common_words)

# Visualize word frequency
words, counts = zip(*common_words)
plt.figure(figsize=(10, 6))
sns.barplot(
    x=list(words), y=list(counts), hue=list(words), palette="tab10", legend=False
)
plt.title("Top 30 Most Common Words in Reviews", fontsize=14)
plt.xlabel("Word", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

Top 30 Most Common Words: [('place', 4077), ('visit', 1353), ('temple', 1284), ('nepal', 1131), ('one', 1116), ('beautiful', 1084), ('kathmandu', 812), ('best', 771), ('good', 687), ('park', 648), ('nice', 585), ('see', 582), ('great', 565), ('world', 563), ('amazing', 539), ('view', 516), ('national', 510), ('heritage', 487), ('must', 479), ('site', 462), ('buddha', 451), ('also', 446), ('lake', 432), ('time', 408), ('many', 400), ('lord', 396), ('around', 365), ('peaceful', 356), ('people', 351), ('valley', 338)]

# Print the data frame to check the changes
df.head()

class TourismSentimentClassifier:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=5000, ngram_range=(1, 2), stop_words="english"
        )

        # Initialize all classifiers
        self.classifiers = {
            "naive_bayes": MultinomialNB(),
            "svm": LinearSVC(random_state=42),
            "logistic_regression": LogisticRegression(random_state=42),
            "random_forest": RandomForestClassifier(n_estimators=100, random_state=42),
            "knn": KNeighborsClassifier(n_neighbors=5),
        }

        self.trained_models = {}
        self.best_model = None
        self.best_model_name = None

    # Prepare initial labels using lexicon-based approach
    def prepare_initial_labels(self, df):
        analyzer = SentimentIntensityAnalyzer()

        def get_initial_label(text):
            scores = analyzer.polarity_scores(text)
            if scores["compound"] >= 0.5:
                return 2  # very positive
            elif scores["compound"] >= 0.1:
                return 1  # positive
            elif scores["compound"] <= -0.5:
                return -2  # very negative
            elif scores["compound"] <= -0.1:
                return -1  # negative
            return 0  # neutral

        return df["cleaned_review"].apply(get_initial_label)

    # Train and evaluate all classifiers
    def train_and_evaluate(self, df):
        # Prepare initial labels
        y = self.prepare_initial_labels(df)
        X = df["cleaned_review"]

        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Train and evaluate each classifier
        results = {}
        for name, classifier in self.classifiers.items():
            # Create pipeline
            pipeline = Pipeline(
                [("vectorizer", self.vectorizer), ("classifier", classifier)]
            )

            # Train
            pipeline.fit(X_train, y_train)

            # Evaluate
            score = pipeline.score(X_test, y_test)
            y_pred = pipeline.predict(X_test)
            report = classification_report(y_test, y_pred)

            # Store results
            results[name] = {"score": score, "report": report, "model": pipeline}

            self.trained_models[name] = pipeline

        # Find best model
        best_score = -1
        for name, result in results.items():
            if result["score"] > best_score:
                best_score = result["score"]
                self.best_model = result["model"]
                self.best_model_name = name

        return results

    # Make predictions using the best model
    def predict(self, text):
        if self.best_model is None:
            raise ValueError("Models haven't been trained yet!")

        return self.best_model.predict([text])[0]

    # Make predictions using ensemble of all models
    def ensemble_predict(self, text):
        if not self.trained_models:
            raise ValueError("Models haven't been trained yet!")

        predictions = []
        for model in self.trained_models.values():
            pred = model.predict([text])[0]
            predictions.append(pred)

        return max(set(predictions), key=predictions.count)

# Comprehensive analysis of reviews using both ML and lexicon-based approaches
def analyze_reviews(df):
    # Initialize and train ML classifiers
    classifier = TourismSentimentClassifier()
    results = classifier.train_and_evaluate(df)

    # Add ML predictions to dataframe
    df["ml_sentiment"] = df["cleaned_review"].apply(classifier.predict)
    df["ensemble_sentiment"] = df["cleaned_review"].apply(classifier.ensemble_predict)

    # Add lexicon-based predictions (from previous implementation)
    analyzer = SentimentIntensityAnalyzer()
    df["lexicon_sentiment"] = df["cleaned_review"].apply(
        lambda x: analyzer.polarity_scores(x)["compound"]
    )

    # Combine predictions
    def get_final_sentiment(row):
        ml_sent = row["ml_sentiment"]
        lex_sent = row["lexicon_sentiment"]
        ensemble_sent = row["ensemble_sentiment"]

        # Weight and combine different approaches
        if abs(ml_sent - ensemble_sent) <= 1:  # ML models agree
            return ml_sent
        else:  # Use lexicon as tiebreaker
            return (
                2
                if lex_sent >= 0.5
                else 1
                if lex_sent >= 0.1
                else -2
                if lex_sent <= -0.5
                else -1
                if lex_sent <= -0.1
                else 0
            )

    df["final_sentiment"] = df.apply(get_final_sentiment, axis=1)

    return df, results

# Get insights about model performance
def get_model_insights(results):
    """
    Get insights about model performance
    """
    insights = {"best_model": None, "model_comparison": {}, "feature_importance": {}}

    best_score = -1
    for name, result in results.items():
        score = result["score"]
        insights["model_comparison"][name] = {
            "accuracy": score,
            "detailed_report": result["report"],
        }

        if score > best_score:
            best_score = score
            insights["best_model"] = name

    return insights

# Analyze reviews using both ML and lexicon-based approaches
df, model_results = analyze_reviews(df)

# Get insights about model performance
insights = get_model_insights(model_results)

print(f"Best performing model: {insights['best_model']}")
for model, metrics in insights["model_comparison"].items():
    print(f"\n{model} accuracy: {metrics['accuracy']:.3f}")

/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/svm/_classes.py:31: FutureWarning: The default value of `dual` will change from `True` to `'auto'` in 1.5. Set the value of `dual` explicitly to suppress the warning.
  warnings.warn(

Best performing model: svm

naive_bayes accuracy: 0.685

svm accuracy: 0.820

logistic_regression accuracy: 0.787

random_forest accuracy: 0.807

knn accuracy: 0.331

# Print the data frame to check the changes
df.head()

class TourismSentimentAnalyzer:
    def __init__(self):
        self.vectorizer = TfidfVectorizer(
            max_features=5000,
            ngram_range=(1, 2),
            stop_words="english",
            min_df=5,  # Ignore terms that appear in less than 5 documents
        )
        # Enhanced SVM parameters
        self.classifier = LinearSVC(
            dual=False,
            random_state=42,
            class_weight="balanced",
            C=1.0,  # Regularization parameter
            tol=1e-4,  # Tolerance for stopping criterion
        )
        self.scaler = MinMaxScaler()

    def fit(self, X, y):
        """Train the model with given data"""
        X_transformed = self.vectorizer.fit_transform(X)
        self.classifier.fit(X_transformed, y)
        return self

    def predict(self, X):
        """Predict sentiment for new data"""
        X_transformed = self.vectorizer.transform(X)
        return self.classifier.predict(X_transformed)


def improved_preprocess_data(df):
    """
    Enhanced preprocessing pipeline with improved SVM-based sentiment analysis
    """
    # Handle warnings
    warnings.filterwarnings("ignore", category=UserWarning)

    # Extract numeric part from 'total_review' and convert to float
    df["total_review"] = df["total_review"].str.extract(r"(\d+)").astype(float)

    # Calculate trust scores
    scaler = MinMaxScaler()
    df["trust_score"] = scaler.fit_transform(df[["total_review"]])

    # Calculate sentiment polarity using TextBlob
    df["sentiment_polarity"] = df["cleaned_review"].apply(
        lambda x: TextBlob(str(x)).sentiment.polarity
    )

    # Initialize and train the sentiment analyzer
    analyzer = TourismSentimentAnalyzer()

    # Create initial labels for training using TextBlob
    initial_sentiments = df["cleaned_review"].apply(
        lambda x: 1
        if TextBlob(str(x)).sentiment.polarity > 0.2
        else -1
        if TextBlob(str(x)).sentiment.polarity < -0.2
        else 0
    )

    # Train the SVM model
    analyzer.fit(df["cleaned_review"], initial_sentiments)

    # Apply ML-based sentiment analysis
    df["ml_sentiment_score"] = analyzer.predict(df["cleaned_review"])

    # Convert ML scores to final classification (positive/negative/neutral)
    df["review_class"] = df["ml_sentiment_score"].map(
        {1: "positive", 0: "neutral", -1: "negative"}
    )

    return df


# Function to apply the sentiment analysis to your main DataFrame
def apply_sentiment_analysis(df):
    """
    Apply the improved sentiment analysis to the main DataFrame
    """
    # Make sure the DataFrame has the required columns
    if "cleaned_review" not in df.columns:
        raise ValueError("DataFrame must contain 'cleaned_review' column")

    # Apply the improved preprocessing
    processed_df = improved_preprocess_data(df)

    return processed_df

# Apply the sentiment analysis
df = apply_sentiment_analysis(df)

# Now df contains:
# - trust_score: normalized score based on total_review
# - ml_sentiment_score: raw SVM predictions
# - review_class: final classification as "positive", "negative", or "neutral"

# Display the distribution of sentiment classes
print("\nSentiment Distribution:")
print(df["review_class"].value_counts())

Sentiment Distribution:
review_class
positive    4776
neutral     2229
negative      97
Name: count, dtype: int64

# Print the data frame to check the changes
df.head()

def extract_patterns(df):
    # Create aspects dictionary
    aspects = {
        "accessibility": [
            "accessibility",
            "transport",
            "transportation",
            "road",
            "roads",
            "access",
            "reach",
            "distance",
        ],
        "accommodation": [
            "hotel",
            "bed",
            "room",
            "stay",
            "house",
            "accommodation",
            "garden",
        ],
        "culture": [
            "temple",
            "culture",
            "tradition",
            "heritage",
            "Buddha",
            "Buddhism",
            "Buddhist",
            "Hindu",
            "Hinduism",
        ],
        "food": ["food", "restaurant", "breakfast", "dinner", "delicious", "cuisine"],
        "nature": [
            "mountain",
            "mountains",
            "view",
            "lake",
            "scenery",
            "landscape",
            "nature",
            "heaven",
            "Himalayas",
            "Everest",
            "Sagarmatha",
        ],
        "religion": [
            "temple",
            "monastery",
            "stupa",
            "Stupa",
            "Spirituality",
            "pagoda",
            "shrine",
            "religion",
            "religious",
            "monk",
            "nun",
            "priest",
        ],
        "safety": [
            "safe",
            "security",
            "dangerous",
            "risk",
            "risky",
            "safety",
            "secure",
            "alert",
            "crime",
            "scam",
            "fraud",
        ],
    }

    # Extract aspects from reviews (similar to association rule mining)
    for aspect, keywords in aspects.items():
        df[aspect] = (
            df["cleaned_review"]
            .fillna("")
            .apply(lambda x: any(word in str(x).lower() for word in keywords))
        )

    return df


# Extract patterns from reviews
df = extract_patterns(df)
df

def calculate_similarities(df):
    # TF-IDF Vectorization
    tfidf = TfidfVectorizer(max_features=1000, stop_words="english")
    text_matrix = tfidf.fit_transform(df["cleaned_review"].fillna(""))

    # PCA for dimension reduction
    pca = PCA(n_components=2)
    text_pca = pca.fit_transform(text_matrix.toarray())
    df["pca1"] = text_pca[:, 0]
    df["pca2"] = text_pca[:, 1]

    return df, tfidf.get_feature_names_out()


# Calculate similarities using TF-IDF and PCA
df, feature_names = calculate_similarities(df)
df

def create_aspect_network(df):
    G = nx.Graph()
    aspects = [
        "accessibility",
        "accommodation",
        "culture",
        "food",
        "nature",
        "religion",
        "safety",
    ]

    # Create edges between aspects that commonly co-occur
    for i in range(len(aspects)):
        for j in range(i + 1, len(aspects)):
            weight = df[df[aspects[i]] & df[aspects[j]]].shape[0]
            if weight > 0:
                G.add_edge(aspects[i], aspects[j], weight=weight)

    return G


# Create aspect network
G = create_aspect_network(df)
G.edges(data=True)

# Display EdgeDataView in a table format
print("\nAspect Network:")
aspect = pd.DataFrame(G.edges(data=True), columns=["Aspect1", "Aspect2", "Weight"])
aspect

Aspect Network:

# Draw the aspect network
plt.figure(figsize=(10, 8))
pos = nx.spring_layout(G, seed=42)
nx.draw_networkx_nodes(G, pos, node_size=1000, node_color="skyblue")
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5, edge_color="gray")
nx.draw_networkx_labels(G, pos, font_size=12, font_color="black")
plt.title("Aspect Network", fontsize=14)
plt.axis("off")
plt.tight_layout()
plt.show()

def analyze_by_trust_segments(df):
    df["trust_segment"] = pd.qcut(
        df["trust_score"],
        q=5,
        labels=["Very Low", "Low", "Medium", "High", "Very High"],
    )
    segment_analysis = df.groupby("trust_segment", observed=False).agg(
        {
            "ml_sentiment_score": "mean",
            "accessibility": "mean",
            "accommodation": "mean",
            "culture": "mean",
            "food": "mean",
            "nature": "mean",
            "religion": "mean",
            "safety": "mean",
        }
    )
    return segment_analysis


# Analyze by trust score segments
segment_analysis = analyze_by_trust_segments(df)
segment_analysis

# Print the data frame to check the changes
df.head()

# Generate insights
insights = {
    "overall_sentiment": df["ml_sentiment_score"].mean(),
    "positive_reviews": (df["review_class"] == "positive").mean(),
    "top_locations": df.groupby("location")["ml_sentiment_score"]
    .mean()
    .sort_values(ascending=False),
    "aspect_frequencies": df[
        [
            "accessibility",
            "accommodation",
            "culture",
            "food",
            "nature",
            "religion",
            "safety",
        ]
    ].mean(),
    "trust_segment_analysis": segment_analysis,
}

# Print key insights
print(f"Overall sentiment: {insights['overall_sentiment']:.3f}")
print("\nPositive review percentage: {:.1%}".format(insights["positive_reviews"]))

Overall sentiment: 0.659

Positive review percentage: 67.2%

def investigate_negative_reviews(df):
    # Filter for negative and neutral reviews
    negative_reviews = df[df["review_class"] == "negative"]

    # Analyze aspect mentions in negative/neutral reviews
    aspect_issues = negative_reviews[
        [
            "accessibility",
            "accommodation",
            "culture",
            "food",
            "nature",
            "religion",
            "safety",
        ]
    ].mean()

    # Analyze sentiment score distribution
    sentiment_distribution = negative_reviews["ml_sentiment_score"].describe()

    # Extract common complaint phrases
    top_complaints = (
        negative_reviews["cleaned_review"]
        .str.lower()
        .str.split()
        .explode()
        .value_counts()
        .head(30)
    )

    return {
        "aspect_issues": aspect_issues,
        "sentiment_distribution": sentiment_distribution,
        "top_complaints": top_complaints,
    }


# Investigate negative reviews
negative_insights = investigate_negative_reviews(df)
print("\nNegative Review Insights:")
print("Aspect Issues:")
print(negative_insights["aspect_issues"])
print("\nSentiment Distribution:")
print(negative_insights["sentiment_distribution"])
print("\nTop Complaint Phrases:")
print(negative_insights["top_complaints"])

Negative Review Insights:
Aspect Issues:
accessibility    0.051546
accommodation    0.041237
culture          0.134021
food             0.020619
nature           0.144330
religion         0.134021
safety           0.000000
dtype: float64

Sentiment Distribution:
count    97.0
mean     -1.0
std       0.0
min      -1.0
25%      -1.0
50%      -1.0
75%      -1.0
max      -1.0
Name: ml_sentiment_score, dtype: float64

Top Complaint Phrases:
cleaned_review
place         33
base          29
camp          27
everest       14
visit         10
nepal         10
see           10
annapurna      9
trek           9
temple         9
least          8
elephants      7
life           7
time           7
go             7
one            7
dirty          7
even           7
expensive      6
like           6
park           6
must           6
get            6
jungle         6
take           5
earthquake     5
walk           5
never          5
guide          5
safari         5
Name: count, dtype: int64

def plot_aspect_issues(aspect_issues):
    plt.figure(figsize=(10, 6))
    sns.barplot(
        x=aspect_issues.index,
        y=aspect_issues.values,
        hue=aspect_issues.index,
        palette="viridis",
        legend=False,
    )
    plt.title("Aspect Issues in Negative Reviews", fontsize=16)
    plt.ylabel("Average Mention Rate", fontsize=12)
    plt.xlabel("Aspects", fontsize=12)
    plt.show()


# Plot aspect issues in negative reviews
plot_aspect_issues(negative_insights["aspect_issues"])

def plot_wordcloud(top_complaints):
    wordcloud = WordCloud(
        width=800, height=400, background_color="white"
    ).generate_from_frequencies(top_complaints.to_dict())
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Top Complaint Phrases", fontsize=16)
    plt.show()


# Plot word cloud of top complaint phrases
plot_wordcloud(negative_insights["top_complaints"])

def generate_positive_insights(df):
    # Filter for positive reviews
    positive_reviews = df[df["review_class"] == "positive"]

    # Extract common positive phrases
    positive_phrases = (
        positive_reviews["cleaned_review"]
        .str.lower()
        .str.split()
        .explode()
        .value_counts()
        .head(20)
    )

    return positive_phrases


# Generate positive review insights
positive_phrases = generate_positive_insights(df)


# Plot word cloud of positive phrases
def plot_wordcloud_positive(positive_phrases):
    wordcloud = WordCloud(
        width=800, height=400, background_color="white", colormap="viridis"
    ).generate_from_frequencies(positive_phrases.to_dict())
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("Positive Phrases Word Cloud", fontsize=16)
    plt.show()


plot_wordcloud_positive(positive_phrases)

def segment_reviews_by_type(df, tourist_types):
    segmented_data = {}

    for tourist_type, keywords in tourist_types.items():
        # Filter reviews containing keywords for this tourist type
        mask = df["cleaned_review"].str.contains(
            "|".join(keywords), case=False, na=False
        )
        type_reviews = df[mask]

        # Calculate sentiment and aspect mentions for this type
        sentiment = type_reviews["ml_sentiment_score"].mean()
        aspect_issues = type_reviews[
            [
                "accessibility",
                "accommodation",
                "culture",
                "food",
                "nature",
                "religion",
                "safety",
            ]
        ].mean()

        # Store results
        segmented_data[tourist_type] = {
            "review_count": type_reviews.shape[0],
            "average_sentiment": sentiment,
            "aspect_issues": aspect_issues,
        }

    return segmented_data


# Define tourist types and associated keywords
tourist_types = {
    "adventure": ["trek", "hike", "climb", "adventure"],
    "cultural": ["temple", "heritage", "festival", "culture"],
    "religious": ["pilgrimage", "sacred", "shrine", "god", "prayer"],
}
tourist_segment_analysis = segment_reviews_by_type(df, tourist_types)
print("\nTourist Segment Analysis:")
for type_, data in tourist_segment_analysis.items():
    print(f"{type_} - Average Sentiment: {data['average_sentiment']:.3f}")
    print(f"Aspect Issues:\n{data['aspect_issues']}")

Tourist Segment Analysis:
adventure - Average Sentiment: 0.711
Aspect Issues:
accessibility    0.084986
accommodation    0.062323
culture          0.076487
food             0.056657
nature           0.382436
religion         0.093484
safety           0.016997
dtype: float64
cultural - Average Sentiment: 0.595
Aspect Issues:
accessibility    0.021879
accommodation    0.035393
culture          0.992921
food             0.041184
nature           0.157658
religion         0.726512
safety           0.005792
dtype: float64
religious - Average Sentiment: 0.439
Aspect Issues:
accessibility    0.022305
accommodation    0.074349
culture          0.591078
food             0.007435
nature           0.115242
religion         0.680297
safety           0.000000
dtype: float64

# Prepare data for visualization
tourist_types = list(tourist_segment_analysis.keys())
review_counts = [data["review_count"] for data in tourist_segment_analysis.values()]
average_sentiments = [
    data["average_sentiment"] for data in tourist_segment_analysis.values()
]
aspect_issues_df = pd.DataFrame(
    [data["aspect_issues"] for data in tourist_segment_analysis.values()],
    index=tourist_types,
)

# Bar chart for Average Sentiment Scores
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, average_sentiments, color=["teal", "orange", "purple"])
plt.title("Average Sentiment by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Average Sentiment", fontsize=12)
plt.ylim(0, 1)  # Sentiment is likely normalized between 0 and 1
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

# Heatmap for Aspect Issues
plt.figure(figsize=(10, 6))
sns.heatmap(
    aspect_issues_df,
    annot=True,
    cmap="YlGnBu",
    fmt=".2f",
    cbar_kws={"label": "Mean Value"},
)
plt.title("Aspect Issues Across Tourist Types", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Tourist Type", fontsize=12)
plt.xticks(rotation=45)
plt.show()

# Bar chart for Review Counts
plt.figure(figsize=(8, 5))
plt.bar(tourist_types, review_counts, color=["skyblue", "salmon", "lightgreen"])
plt.title("Review Counts by Tourist Type", fontsize=14)
plt.xlabel("Tourist Type", fontsize=12)
plt.ylabel("Review Count", fontsize=12)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

def compare_regions(df):
    # Group by location
    location_stats = (
        df.groupby("location")
        .agg(
            {
                "ml_sentiment_score": ["mean", "std"],
                "accessibility": "mean",
                "accommodation": "mean",
                "culture": "mean",
                "food": "mean",
                "nature": "mean",
                "religion": "mean",
                "safety": "mean",
            }
        )
        .sort_values(("ml_sentiment_score", "mean"), ascending=False)
    )

    # Identify top complaints for each location
    complaint_keywords = df.groupby("location")["cleaned_review"].apply(
        lambda texts: texts.str.lower().str.split().explode().value_counts().head(5)
    )

    return location_stats, complaint_keywords


region_stats, location_complaints = compare_regions(df)
print("\nRegional Sentiment Comparison:")
print(region_stats)
print("\nTop Complaints by Region:")
print(location_complaints)

Regional Sentiment Comparison:
                        ml_sentiment_score           accessibility  \
                                      mean       std          mean   
location                                                             
Pokhara                           0.830128  0.405827      0.017094   
Syambhunath                       0.727957  0.454819      0.046237   
Chitwan National Park             0.723656  0.477682      0.018280   
Langtang                          0.704309  0.475841      0.031204   
Bardiya National Park             0.680672  0.480824      0.018487   
Bhaktapur Durbar Square           0.664516  0.490303      0.010753   
Pasupatinath Temple               0.540860  0.529991      0.007527   
Annapurna Base Camp               0.484848  0.682215      0.030303   
Lumbini                           0.473797  0.518510      0.009626   
Everest Base Camp Trek            0.405405  0.692537      0.090090   

                        accommodation   culture      food    nature  religion  \
                                 mean      mean      mean      mean      mean   
location                                                                        
Pokhara                      0.051282  0.071581  0.141026  0.533120  0.073718   
Syambhunath                  0.012903  0.354839  0.019355  0.366667  0.541935   
Chitwan National Park        0.064516  0.079570  0.019355  0.154839  0.016129   
Langtang                     0.029718  0.026746  0.016345  0.239227  0.002972   
Bardiya National Park        0.045378  0.013445  0.008403  0.097479  0.006723   
Bhaktapur Durbar Square      0.027957  0.411828  0.078495  0.030108  0.191398   
Pasupatinath Temple          0.013978  0.523656  0.002151  0.058065  0.550538   
Annapurna Base Camp          0.022727  0.000000  0.022727  0.348485  0.000000   
Lumbini                      0.041711  0.217112  0.003209  0.019251  0.225668   
Everest Base Camp Trek       0.036036  0.000000  0.018018  0.207207  0.000000   

                           safety  
                             mean  
location                           
Pokhara                  0.010684  
Syambhunath              0.006452  
Chitwan National Park    0.013978  
Langtang                 0.005944  
Bardiya National Park    0.011765  
Bhaktapur Durbar Square  0.004301  
Pasupatinath Temple      0.007527  
Annapurna Base Camp      0.007576  
Lumbini                  0.007487  
Everest Base Camp Trek   0.018018  

Top Complaints by Region:
location                           
Annapurna Base Camp      place          37
                         annapurna      35
                         amazing        25
                         trek           22
                         one            21
Bardiya National Park    place         210
                         park          162
                         national      135
                         bardiya        77
                         good           71
Bhaktapur Durbar Square  place         693
                         bhaktapur     306
                         square        250
                         visit         240
                         durbar        217
Chitwan National Park    place         431
                         park          383
                         national      288
                         safari        248
                         one           231
Everest Base Camp Trek   everest        31
                         base           29
                         camp           29
                         experience     22
                         place          20
Langtang                 place         216
                         beautiful     111
                         trekking       92
                         park           92
                         langtang       86
Lumbini                  place         718
                         buddha        411
                         temple        233
                         birth         232
                         lord          217
Pasupatinath Temple      temple        624
                         place         477
                         visit         256
                         hindu         234
                         one           205
Pokhara                  place         641
                         lake          414
                         beautiful     257
                         pokhara       243
                         visit         177
Syambhunath              place         634
                         kathmandu     478
                         temple        305
                         view          274
                         visit         240
Name: cleaned_review, dtype: int64

# Regional Sentiment Comparison
plt.figure(figsize=(10, 6))
sentiment_data = region_stats[("ml_sentiment_score", "mean")].sort_values()
sentiment_data.plot(kind="barh", color="skyblue", edgecolor="black")
plt.title("Mean Sentiment Scores by Region", fontsize=14)
plt.xlabel("Mean Sentiment Score", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()

# Aspect Metrics Heatmap
plt.figure(figsize=(12, 8))
aspect_data = region_stats.xs("mean", axis=1, level=1).iloc[
    :, 1:
]  # Exclude sentiment score
sns.heatmap(aspect_data, annot=True, cmap="coolwarm", cbar_kws={"label": "Mean Value"})
plt.title("Aspect Metrics by Region", fontsize=14)
plt.xlabel("Aspects", fontsize=12)
plt.ylabel("Location", fontsize=12)
plt.tight_layout()
plt.show()

# Correlation between sentiment and aspects
correlations = region_stats.corr()["ml_sentiment_score"]["mean"]
print("Correlation between sentiment and aspects:\n", correlations)

Correlation between sentiment and aspects:
 ml_sentiment_score  mean    1.000000
                    std    -0.877118
accessibility       mean   -0.380292
accommodation       mean    0.298009
culture             mean    0.015219
food                mean    0.559874
nature              mean    0.423403
religion            mean    0.009081
safety              mean   -0.214823
Name: mean, dtype: float64

df.describe()

# Prepare data
correlation_values = correlations[1:]  # Exclude "mean" which is always 1.0
aspects = correlation_values.index.get_level_values(0)

# Plot
plt.figure(figsize=(10, 6))
correlation_values.plot(kind="barh", color="steelblue", edgecolor="black")
plt.axvline(0, color="black", linewidth=0.8, linestyle="--")
plt.title("Correlation Between Sentiment and Aspects", fontsize=14)
plt.xlabel("Correlation Coefficient", fontsize=12)
plt.ylabel("Aspects", fontsize=12)
plt.tight_layout()
plt.show()

# Print the data frame to check the changes
df.head()

	ID	location	total_review	review
0	1	Syambhunath	46 reviews	It is at the top of valleys mountain. Best pl...
1	2	Syambhunath	132 reviews	This place has a significant importance in Bud...
2	3	Syambhunath	298 reviews	Visited this from the other side on a rainy ev...
3	4	Syambhunath	247 reviews	A beautiful temple situated in the capital wit...
4	5	Syambhunath	69 reviews	great, beautiful, historic & religious place.....

	ID	location	total_review	review	cleaned_review
0	1	Syambhunath	46 reviews	It is at the top of valleys mountain. Best pl...	top valleys mountain best place get pleasure r...
1	2	Syambhunath	132 reviews	This place has a significant importance in Bud...	place significant importance buddhism visited ...
2	3	Syambhunath	298 reviews	Visited this from the other side on a rainy ev...	visited side rainy evening actually visit temp...
3	4	Syambhunath	247 reviews	A beautiful temple situated in the capital wit...	beautiful temple situated capital stunning vie...
4	5	Syambhunath	69 reviews	great, beautiful, historic & religious place.....	great beautiful historic religious place crowd...

	ID	location	total_review	review	cleaned_review	total_review_count
0	1	Syambhunath	46 reviews	It is at the top of valleys mountain. Best pl...	top valleys mountain best place get pleasure r...	46
1	2	Syambhunath	132 reviews	This place has a significant importance in Bud...	place significant importance buddhism visited ...	132
2	3	Syambhunath	298 reviews	Visited this from the other side on a rainy ev...	visited side rainy evening actually visit temp...	298
3	4	Syambhunath	247 reviews	A beautiful temple situated in the capital wit...	beautiful temple situated capital stunning vie...	247
4	5	Syambhunath	69 reviews	great, beautiful, historic & religious place.....	great beautiful historic religious place crowd...	69

	ID	location	total_review	review	cleaned_review	total_review_count	review_tokens
0	1	Syambhunath	46 reviews	It is at the top of valleys mountain. Best pl...	top valleys mountain best place get pleasure r...	46	[top, valleys, mountain, best, place, get, ple...
1	2	Syambhunath	132 reviews	This place has a significant importance in Bud...	place significant importance buddhism visited ...	132	[place, significant, importance, buddhism, vis...
2	3	Syambhunath	298 reviews	Visited this from the other side on a rainy ev...	visited side rainy evening actually visit temp...	298	[visited, side, rainy, evening, actually, visi...
3	4	Syambhunath	247 reviews	A beautiful temple situated in the capital wit...	beautiful temple situated capital stunning vie...	247	[beautiful, temple, situated, capital, stunnin...
4	5	Syambhunath	69 reviews	great, beautiful, historic & religious place.....	great beautiful historic religious place crowd...	69	[great, beautiful, historic, religious, place,...

	ID	location	total_review	review	cleaned_review	total_review_count	review_tokens	ml_sentiment	ensemble_sentiment	lexicon_sentiment	final_sentiment
0	1	Syambhunath	46 reviews	It is at the top of valleys mountain. Best pl...	top valleys mountain best place get pleasure r...	46	[top, valleys, mountain, best, place, get, ple...	2	2	0.9738	2
1	2	Syambhunath	132 reviews	This place has a significant importance in Bud...	place significant importance buddhism visited ...	132	[place, significant, importance, buddhism, vis...	2	2	0.9325	2
2	3	Syambhunath	298 reviews	Visited this from the other side on a rainy ev...	visited side rainy evening actually visit temp...	298	[visited, side, rainy, evening, actually, visi...	2	2	0.8860	2
3	4	Syambhunath	247 reviews	A beautiful temple situated in the capital wit...	beautiful temple situated capital stunning vie...	247	[beautiful, temple, situated, capital, stunnin...	2	2	0.9531	2
4	5	Syambhunath	69 reviews	great, beautiful, historic & religious place.....	great beautiful historic religious place crowd...	69	[great, beautiful, historic, religious, place,...	2	2	0.9468	2

Data Cleaning & Preprocessing¶

Exploratory Data Analysis (EDA)¶

EDA Visualization¶

ML Model Training¶

ML Model Tuning & Testing¶

Pattern Mining¶

Similarity-based Analysis¶

Network Analysis¶

Segmentation Analysis¶

Aspect Based Sentiment Analysis¶

FIN¶

	ID	location	total_review	review	cleaned_review	total_review_count	review_tokens	ml_sentiment	ensemble_sentiment	lexicon_sentiment	final_sentiment	trust_score	sentiment_polarity	ml_sentiment_score	review_class
0	1	Syambhunath	46.0	It is at the top of valleys mountain. Best pl...	top valleys mountain best place get pleasure r...	46	[top, valleys, mountain, best, place, get, ple...	2	2	0.9738	2	0.045593	0.700000	1	positive
1	2	Syambhunath	132.0	This place has a significant importance in Bud...	place significant importance buddhism visited ...	132	[place, significant, importance, buddhism, vis...	2	2	0.9325	2	0.132725	0.329167	1	positive
2	3	Syambhunath	298.0	Visited this from the other side on a rainy ev...	visited side rainy evening actually visit temp...	298	[visited, side, rainy, evening, actually, visi...	2	2	0.8860	2	0.300912	0.364583	1	positive
3	4	Syambhunath	247.0	A beautiful temple situated in the capital wit...	beautiful temple situated capital stunning vie...	247	[beautiful, temple, situated, capital, stunnin...	2	2	0.9531	2	0.249240	0.687500	1	positive
4	5	Syambhunath	69.0	great, beautiful, historic & religious place.....	great beautiful historic religious place crowd...	69	[great, beautiful, historic, religious, place,...	2	2	0.9468	2	0.068896	0.500000	1	positive

	Aspect1	Aspect2	Weight
0	accessibility	accommodation	{'weight': 9}
1	accessibility	culture	{'weight': 34}
2	accessibility	food	{'weight': 8}
3	accessibility	nature	{'weight': 46}
4	accessibility	religion	{'weight': 42}
5	accessibility	safety	{'weight': 2}
6	accommodation	culture	{'weight': 54}
7	accommodation	food	{'weight': 48}
8	accommodation	nature	{'weight': 66}
9	accommodation	religion	{'weight': 44}
10	accommodation	safety	{'weight': 3}
11	culture	food	{'weight': 63}
12	culture	nature	{'weight': 246}
13	culture	religion	{'weight': 1129}
14	culture	safety	{'weight': 9}
15	food	nature	{'weight': 112}
16	food	religion	{'weight': 44}
17	food	safety	{'weight': 4}
18	nature	religion	{'weight': 295}
19	nature	safety	{'weight': 14}
20	religion	safety	{'weight': 12}

	ml_sentiment_score	accessibility	accommodation	culture	food	nature	religion	safety
trust_segment
Very Low	0.675746	0.014916	0.033074	0.151751	0.027886	0.167315	0.136187	0.007782
Low	0.671111	0.018519	0.029630	0.215556	0.031111	0.194074	0.196296	0.008148
Medium	0.690632	0.018155	0.031954	0.227306	0.040668	0.202614	0.227306	0.008715
High	0.662429	0.026130	0.038136	0.247881	0.052966	0.199859	0.243644	0.007062
Very High	0.594213	0.026817	0.044460	0.268878	0.035992	0.204658	0.255469	0.011291

	ID	total_review	total_review_count	ml_sentiment	ensemble_sentiment	lexicon_sentiment	final_sentiment	trust_score	sentiment_polarity	ml_sentiment_score	pca1	pca2
count	7102.000000	7102.000000	7102.000000	7102.000000	7102.000000	7102.000000	7102.000000	7102.000000	7102.000000	7102.000000	7.102000e+03	7.102000e+03
mean	3631.451563	80.566742	80.566742	1.415517	1.469023	0.527836	1.418051	0.080615	0.384124	0.658828	4.001930e-18	4.802316e-17
std	2117.239644	119.807382	119.807382	0.881742	0.810323	0.347161	0.878782	0.121385	0.318223	0.502121	1.443194e-01	1.361941e-01
min	1.000000	1.000000	1.000000	-2.000000	-2.000000	-0.851900	-2.000000	0.000000	-1.000000	-1.000000	-2.904202e-01	-4.222869e-01
25%	1809.250000	13.000000	13.000000	1.000000	1.000000	0.340000	1.000000	0.012158	0.100000	0.000000	-9.846385e-02	-4.854880e-02
50%	3584.500000	38.000000	38.000000	2.000000	2.000000	0.624900	2.000000	0.037487	0.386970	1.000000	-3.502423e-02	-4.703180e-03
75%	5495.750000	94.000000	94.000000	2.000000	2.000000	0.812600	2.000000	0.094225	0.600000	1.000000	6.933070e-02	2.404595e-02
max	7271.000000	988.000000	988.000000	2.000000	2.000000	0.989500	2.000000	1.000000	1.000000	1.000000	7.903685e-01	8.742405e-01