from collections import Counter
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
import numpy as np
import pickle
import random
from scipy.spatial.distance import cosine
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
import time
import torch
from torch import optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from tqdm.auto import tqdm, trange
import wandb
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
Create an efficient random number generator
class RandomNumberGenerator:
"""
A wrapper class around numpy's random number generator that holds buffers of pre-generated
random numbers for faster access. Calls to random() and randint() draw from these buffers,
which are refilled whenever they run out.
"""
def __init__(self, buffer_size, seed=12345):
"""
Initializes the random number generator with a seed and a buffer size of random numbers to use
Args:
buffer_size: The number of random numbers to pre-generate. This should be a large
enough number that the buffer is not regenerated too frequently
seed: The seed for the random number generator
"""
self.buffer_size = buffer_size
self.max_val = -1
# Create a random number generator using numpy and set its seed
self.rng = np.random.RandomState(seed)
# Pre-generate a buffer of random floats to use for random()
self.float_buffer = self.rng.random(buffer_size)
self.float_index = 0
# Initialize the integer buffer (will be created when set_max_val is called)
self.int_buffer = None
self.int_index = 0
def random(self):
"""
Returns a random float value between 0 and 1
"""
# Check if we need to refill the buffer
if self.float_index >= self.buffer_size:
self.float_buffer = self.rng.random(self.buffer_size)
self.float_index = 0
# Get the next random number from the buffer
random_value = self.float_buffer[self.float_index]
self.float_index += 1
return random_value
def set_max_val(self, max_val):
"""
Sets the maximum integer value for randint and creates the buffer of random integers
"""
self.max_val = max_val
# Create a buffer of random integers in [0, max_val]
self.int_buffer = self.rng.randint(0, self.max_val + 1, self.buffer_size)
self.int_index = 0
def randint(self):
"""
Returns a random int value between 0 and self.max_val (inclusive), drawn from the buffer
"""
if self.max_val == -1:
raise ValueError("Need to call set_max_val before calling randint")
# Check if we need to refill the buffer
if self.int_index >= self.buffer_size:
self.int_buffer = self.rng.randint(0, self.max_val + 1, self.buffer_size)
self.int_index = 0
# Get the next random integer from the buffer
random_int = self.int_buffer[self.int_index]
self.int_index += 1
return random_int
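As a quick sanity check of the buffering idea, the sketch below times buffered draws against direct per-call numpy draws; the loop count and buffer size here are arbitrary choices for illustration, and the actual speedup (if any) will depend on hardware and the buffer size.

import time
import numpy as np

n_draws = 1_000_000
buffered_rng = RandomNumberGenerator(buffer_size=100_000)
buffered_rng.set_max_val(999)

start = time.time()
for _ in range(n_draws):
    buffered_rng.randint()
print(f"Buffered randint draws: {time.time() - start:.2f}s")

start = time.time()
for _ in range(n_draws):
    np.random.randint(0, 1000)
print(f"Direct np.random.randint calls: {time.time() - start:.2f}s")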
Create a class to hold the data
class Corpus:
def __init__(self, rng: RandomNumberGenerator):
self.tokenizer = RegexpTokenizer(r"\w+")
self.rng = rng
self.word_to_index = {} # word to unique-id
self.index_to_word = {} # unique-id to word
# How many times each word occurs in our data after filtering
self.word_counts = Counter()
# A utility data structure that lets us quickly sample "negative"
# instances in a context. This table contains unique-ids
self.negative_sampling_table = []
# The dataset we'll use for training, as a sequence of unique word
# ids. This is the sequence across all documents after tokens have been
# randomly subsampled by the word2vec preprocessing step
self.full_token_sequence_as_ids = None
def tokenize(self, text):
"""
Tokenizes the given text and returns a list of its tokens
"""
return self.tokenizer.tokenize(text)
def load_data(self, file_name, min_token_freq):
"""
Reads the data from the specified file as one long sequence of text
(ignoring line breaks) and populates the data structures of this
word2vec object.
"""
# Step 1: Read in the file and create a long sequence of tokens for
# all tokens in the file
all_tokens = []
print("Reading data and tokenizing")
# Read the file
with open(file_name, "r", encoding="utf-8") as f:
for line in f:
tokens = self.tokenize(line.lower())
all_tokens.extend(tokens)
# Step 2: Count how many tokens we have of each type
print("Counting token frequencies")
raw_counts = Counter(all_tokens)
# Step 3: Replace all tokens below the specified frequency with an <UNK>
# token.
print("Performing minimum thresholding")
filtered_tokens = []
for token in all_tokens:
if raw_counts[token] >= min_token_freq:
filtered_tokens.append(token)
else:
filtered_tokens.append("<UNK>")
# Step 4: update self.word_counts to be the number of times each word
# occurs (including <UNK>)
self.word_counts = Counter(filtered_tokens)
# Step 5: Create the mappings from word to unique integer ID and the
# reverse mapping.
for i, word in enumerate(self.word_counts.keys()):
self.word_to_index[word] = i
self.index_to_word[i] = word
# Step 6: Compute the probability of keeping any particular *token* of a
# word in the training sequence, which we'll use to subsample. This subsampling
# avoids having the training data be filled with many overly common words
# as positive examples in the context
# Calculate total number of tokens
total_tokens = len(filtered_tokens)
# Word2Vec subsampling formula
# t is typically around 1e-5
t = 1e-5
word_to_sample_prob = {}
for word, count in self.word_counts.items():
# Calculate the word frequency
freq = count / total_tokens
# Probability to keep the word
word_to_sample_prob[word] = (np.sqrt(freq / t) + 1) * (t / freq)
# Ensure probability doesn't exceed 1
word_to_sample_prob[word] = min(word_to_sample_prob[word], 1.0)
# Step 7: process the list of tokens (after min-freq filtering) to fill
# a new list self.full_token_sequence_as_ids with each kept token's unique ID,
# randomly subsampling tokens using the keep-probabilities computed above
self.full_token_sequence_as_ids = []
for token in filtered_tokens:
# Perform subsampling: randomly decide whether to keep this token
if self.rng.random() < word_to_sample_prob[token]:
# Convert to ID and add to sequence
self.full_token_sequence_as_ids.append(self.word_to_index[token])
# Helpful print statement to verify what you've loaded
print(
"Loaded all data from %s; saw %d tokens (%d unique)"
% (file_name, len(self.full_token_sequence_as_ids), len(self.word_to_index))
)
def generate_negative_sampling_table(self, exp_power=0.75, table_size=1e6):
"""
Generates a big list data structure that we can quickly randomly index into
in order to select a negative training example (i.e., a word that was
*not* present in the context).
"""
# Step 1: Figure out how many instances of each word need to go into the
# negative sampling table.
print("Generating sampling table")
# Convert table_size to integer
table_size = int(table_size)
# Calculate the distribution with the specified power
word_counts_powered = {}
total_powered = 0
for word, count in self.word_counts.items():
if word == "<UNK>":
continue
word_counts_powered[word] = count**exp_power
total_powered += word_counts_powered[word]
# Step 2: Create the table to the correct size.
self.negative_sampling_table = np.zeros(table_size, dtype=int)
# Step 3: Fill the table so that each word has a number of IDs
# proportionate to its probability of being sampled.
index = 0
for word, powered_count in word_counts_powered.items():
# Calculate how many slots this word should occupy in the table
word_id = self.word_to_index[word]
num_slots = int((powered_count / total_powered) * table_size)
# Fill those slots with this word's ID
self.negative_sampling_table[index : index + num_slots] = word_id
index += num_slots
# If we didn't fill the entire table due to rounding, fill the rest with the last word
if index < table_size:
self.negative_sampling_table[index:] = self.negative_sampling_table[
index - 1
]
# Set the max value for the random number generator
self.rng.set_max_val(table_size - 1)
def generate_negative_samples(self, cur_context_word_id, num_samples):
"""
Randomly samples the specified number of negative samples from the lookup
table and returns this list of IDs as a numpy array. As a performance
improvement, avoid sampling a negative example that has the same ID as
the current positive context word.
"""
results = []
# Create a list and sample from the negative_sampling_table to
# grow the list to num_samples, avoiding adding a negative example that
# has the same ID as the current context_word
while len(results) < num_samples:
# Get a random index into the negative sampling table
idx = self.rng.randint()
# Get the word ID at that position in the table
sampled_id = self.negative_sampling_table[idx]
# Only add it if it's not the current context word
if sampled_id != cur_context_word_id:
results.append(sampled_id)
return np.array(results)
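To make the mechanics of the table concrete before running it on the real corpus, here is a toy, self-contained illustration (with made-up counts and a small table size, not values from the corpus) of how raising counts to the 0.75 power and unrolling them into an array turns weighted negative sampling into a uniform array lookup.

import numpy as np

# Hypothetical counts for a three-word vocabulary: word id -> raw count
toy_counts = {0: 100, 1: 10, 2: 1}
exp_power, table_size = 0.75, 1000

powered = {wid: count ** exp_power for wid, count in toy_counts.items()}
total_powered = sum(powered.values())

toy_table = []
for wid, p in powered.items():
    toy_table.extend([wid] * int(round(p / total_powered * table_size)))
toy_table = np.array(toy_table)

# Negative sampling is now just a uniform random index into the table
toy_rng = np.random.RandomState(0)
samples = toy_table[toy_rng.randint(0, len(toy_table), size=10000)]
print({wid: round(float(np.mean(samples == wid)), 3) for wid in toy_counts})
# Expected proportions with these counts: roughly 0.83, 0.15, and 0.03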
Create the corpus
rng = RandomNumberGenerator(10000)
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.tiny.txt", 2)
# Add debug prints after loading data
print(f"Vocabulary size: {len(corpus.word_to_index)}")
print(f"Number of <UNK> tokens: {corpus.word_counts.get('<UNK>', 0)}")
print(f"Most common words: {corpus.word_counts.most_common(10)}")
print(f"Sample of token sequence: {corpus.full_token_sequence_as_ids[:20]}")
# Generate negative sampling table
corpus.generate_negative_sampling_table()
# Add debug prints for negative sampling
print(f"Negative sampling table size: {len(corpus.negative_sampling_table)}")
# Test negative sampling
test_word_id = list(corpus.index_to_word.keys())[0] # Get first word ID
neg_samples = corpus.generate_negative_samples(test_word_id, 5)
print(
f"5 negative samples for word '{corpus.index_to_word[test_word_id]}': {neg_samples}"
)
print(f"Corresponding words: {[corpus.index_to_word[idx] for idx in neg_samples]}")
Reading data and tokenizing
Counting token frequencies
Performing minimum thresholding
Loaded all data from reviews-word2vec.tiny.txt; saw 2015 tokens (1410 unique)
Vocabulary size: 1410
Number of <UNK> tokens: 2289
Most common words: [('<UNK>', 2289), ('the', 1095), ('i', 657), ('a', 567), ('and', 540), ('to', 529), ('it', 443), ('of', 434), ('this', 402), ('book', 400)]
Sample of token sequence: [12, 27, 30, 36, 66, 67, 70, 73, 77, 81, 86, 87, 95, 11, 102, 11, 110, 112, 117, 99]
Generating sampling table
Negative sampling table size: 1000000
5 negative samples for word 'this': [ 220 187 29 1092 1326]
Corresponding words: ['author', 'people', 'i', 'eh', 'silly']
# Test with medium dataset
rng = RandomNumberGenerator(100000) # Larger buffer size for bigger dataset
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.med.txt", 2)
# Print some statistics
print(f"Vocabulary size: {len(corpus.word_to_index)}")
print(f"Number of <UNK> tokens: {corpus.word_counts.get('<UNK>', 0)}")
print(f"Most common words: {corpus.word_counts.most_common(10)}")
print(f"Token sequence length: {len(corpus.full_token_sequence_as_ids)}")
# Generate negative sampling table and test it
print("Generating negative sampling table...")
start_time = time.time()
corpus.generate_negative_sampling_table()
sampling_time = time.time() - start_time
print(f"Negative sampling table generated in {sampling_time:.2f} seconds")
# Test negative sampling speed
print("Testing negative sampling speed...")
start_time = time.time()
for _ in range(1000):
word_id = random.choice(list(corpus.index_to_word.keys()))
neg_samples = corpus.generate_negative_samples(word_id, 10)
sampling_time = time.time() - start_time
print(f"1000 negative sampling operations completed in {sampling_time:.2f} seconds")
Reading data and tokenizing
Counting token frequencies
Performing minimum thresholding
Loaded all data from reviews-word2vec.med.txt; saw 2297051 tokens (52081 unique)
Vocabulary size: 52081
Number of <UNK> tokens: 49357
Most common words: [('the', 527363), ('i', 351103), ('and', 287923), ('a', 287493), ('to', 268032), ('it', 237834), ('of', 222157), ('book', 208522), ('this', 208367), ('is', 160020)]
Token sequence length: 2297051
Generating negative sampling table...
Generating sampling table
Negative sampling table generated in 0.03 seconds
Testing negative sampling speed...
1000 negative sampling operations completed in 0.19 seconds
Corpus Processing Insights
The corpus processing reveals patterns typical of natural language. After subsampling, the tiny dataset yields 2,015 tokens with 1,410 unique words, while the medium dataset contains roughly 2.3 million tokens and 52,081 unique words. The frequency distribution follows Zipf's law, with common function words dominating: "the" (527,363 occurrences), "i" (351,103), and so on. Words occurring fewer than the minimum threshold of 2 times were replaced with the <UNK> token, which accounts for 49,357 token occurrences in the medium corpus (2,289 in the tiny one) before subsampling.
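To see why subsampling reshapes this distribution, the keep probability from Step 6, p(w) = (sqrt(f/t) + 1) * t/f with t = 1e-5, can be evaluated for a few illustrative relative frequencies (the frequencies below are rough assumed values, not exact corpus statistics).

import numpy as np

t = 1e-5  # same subsampling threshold used in load_data
for word, freq in [("the", 0.05), ("book", 0.02), ("monarch", 1e-5)]:  # assumed relative frequencies
    keep_prob = min((np.sqrt(freq / t) + 1) * (t / freq), 1.0)
    print(f"{word:>8}: relative frequency {freq:.0e} -> keep about {keep_prob:.3f} of its tokens")
# Very common words are mostly dropped, while rare words are always kept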
def explore_context_examples(corpus, num_examples=5, context_size=5):
"""Explore some example contexts from the corpus"""
# Get sequence of tokens
token_sequence = corpus.full_token_sequence_as_ids
sequence_length = len(token_sequence)
print(
f"\nExploring {num_examples} random contexts with window size {context_size}:"
)
for _ in range(num_examples):
# Pick a random position in the sequence
pos = random.randint(context_size, sequence_length - context_size - 1)
# Get target word and its context
target_id = token_sequence[pos]
target_word = corpus.index_to_word[target_id]
# Get context (words before and after the target)
context_start = max(0, pos - context_size)
context_end = min(sequence_length, pos + context_size + 1)
context_ids = (
token_sequence[context_start:pos] + token_sequence[pos + 1 : context_end]
)
context_words = [corpus.index_to_word[idx] for idx in context_ids]
# Generate some negative samples
neg_sample_ids = corpus.generate_negative_samples(target_id, 5)
neg_sample_words = [corpus.index_to_word[idx] for idx in neg_sample_ids]
print(f"\nTarget word: '{target_word}'")
print(f"Context words: {context_words}")
print(f"Negative samples: {neg_sample_words}")
# Test with tiny dataset
rng = RandomNumberGenerator(10000)
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.tiny.txt", 2)
corpus.generate_negative_sampling_table()
explore_context_examples(corpus)
Reading data and tokenizing Counting token frequencies Performing minimum thresholding Loaded all data from reviews-word2vec.tiny.txt; saw 2015 tokens (1410 unique) Generating sampling table Exploring 5 random contexts with window size 5: Target word: 'his' Context words: ['genesis', 's', 'war', 'he', 'tells', 'point', 'wrong', 'oh', 'sure', 'missing'] Negative samples: ['the', 'however', 'for', 'ago', 'it'] Target word: 'developed' Context words: ['respect', 'you', 'much', 'repetition', 'very', 'them', 'learning', 'song', 'disappointed', 'hear'] Negative samples: ['error', 'along', 'sense', 'fast', 'is'] Target word: 'apart' Context words: ['definitely', 't', 'corny', 'took', 'unfold', 'son', 'jack', 'guy', 'against', 't'] Negative samples: ['this', 'young', 's', 'very', 's'] Target word: 'further' Context words: ['t', 'fair', 'flat', 'inspirational', 'quotes', 'informative', 'writers', 'turns', 'after', 'at'] Negative samples: ['accept', 're', 'and', 'new', 'them'] Target word: 'needs' Context words: ['didn', 'all', 'assume', 'already', 'humor', 'some', 'info', 'repeated', 'ask', 'know'] Negative samples: ['husband', 'two', 'not', 'our', 'the']
# Test with medium dataset
rng = RandomNumberGenerator(100000) # Larger buffer size for bigger dataset
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.med.txt", 2)
corpus.generate_negative_sampling_table()
explore_context_examples(corpus)
Reading data and tokenizing Counting token frequencies Performing minimum thresholding Loaded all data from reviews-word2vec.med.txt; saw 2297051 tokens (52081 unique) Generating sampling table Exploring 5 random contexts with window size 5: Target word: 'flow' Context words: ['only', 'bathroom', 'breaks', 'least', 'humor', 'impeccable', 'can', 'zombie', 'entertainment', 'along'] Negative samples: ['machine', 'entirely', 'barry', 'very', 'powers'] Target word: 'shack' Context words: ['certainly', 'heck', 'jak', 'better', 'loved', 'cross', 'roads', 'eve', 'wm', 'buy'] Negative samples: ['many', 'grow', 'systematically', 's', 'to'] Target word: 'monarch' Context words: ['mr', 'levine', 'why', 'star', 'learned', 'behavior', 'got', 'preachy', 'about', 'global'] Negative samples: ['ever', 'was', 'book', 'bogs', 'grandkids'] Target word: 'politicians' Context words: ['marketers', 'whacky', 'politicians', 'usual', 'bent', 'even', 'average', 'wrote', 'negative', 'questions'] Negative samples: ['be', 'aware', 'another', 'mildly', 'attention'] Target word: 'jk' Context words: ['our', 'cuckoo', 'calling', 'listen', 'of', 'rowling', 'stuck', 'potter', 'someone', 'mark'] Negative samples: ['a', 'would', 'helicopter', '3', 'qualms']
Context Exploration Insights
Exploring contexts with a window size of 5 reveals meaningful word relationships in both datasets. For example, "jk" appears with semantically related terms like "rowling", "potter", and "cuckoo calling" (her book), while "monarch" appears with "behavior" and "levine". Even after subsampling and <UNK> filtering, the extracted contexts retain strong semantic coherence, which is exactly the signal needed to learn meaningful word embeddings. The negative samples ("a", "would", "helicopter", etc.) show no particular relationship to the target words, as expected: they are drawn from the corpus-wide smoothed unigram distribution, not from the local context.
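A quick way to check where the negatives come from is to draw a large number of them and look at which words dominate; since the table encodes the count^0.75 distribution, frequent corpus words should also be the most frequent negatives. This sketch reuses the corpus object from the cell above; the probe word id and draw counts are arbitrary.

from collections import Counter

negative_draws = Counter()
probe_word_id = 0  # any valid word id works; it is only excluded from its own negatives
for _ in range(2000):
    for wid in corpus.generate_negative_samples(probe_word_id, 10):
        negative_draws[wid] += 1

print("Most frequently drawn negative words:")
for wid, n in negative_draws.most_common(10):
    print(f"  {corpus.index_to_word[wid]:>10}: {n}")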
Generate the training data
window_size = 5
num_negative_samples_per_target = 20
training_data = []
# Get the sequence of token IDs
token_sequence = corpus.full_token_sequence_as_ids
sequence_length = len(token_sequence)
# Maximum number of context words for any target word is window_size * 2
# (window_size words before and window_size words after)
max_context_size = window_size * 2
# Use tqdm for a progress bar
print(f"Generating training examples with window size {window_size}...")
for i in tqdm(range(sequence_length)):
target_word_id = token_sequence[i]
# Define the context window, ensuring it doesn't go out of bounds
context_start = max(0, i - window_size)
context_end = min(sequence_length, i + window_size + 1)
# Get positive context words (excluding the target word itself)
positive_context_ids = []
for j in range(context_start, context_end):
if j != i: # Skip the target word itself
context_word_id = token_sequence[j]
if context_word_id != corpus.word_to_index.get("<UNK>", -1):
positive_context_ids.append(context_word_id)
# Count how many positive context words we have
num_positive = len(positive_context_ids)
# We need to ensure each instance has the same total size (positive + negative)
num_negative = max_context_size - num_positive + num_negative_samples_per_target
# Generate negative samples
negative_context_ids = corpus.generate_negative_samples(
target_word_id, num_negative
)
# Combine positive and negative context words
all_context_ids = np.array(positive_context_ids + list(negative_context_ids))
# Create labels (1 for positive context, 0 for negative samples)
labels = np.array([1] * num_positive + [0] * num_negative)
training_data.append(
(
np.array([target_word_id]), # Target word ID as a numpy array
all_context_ids, # Context word IDs (positive and negative)
labels, # Labels (1 for positive, 0 for negative)
)
)
print(f"Generated {len(training_data)} training examples")
# Print some examples
for i in range(3):
target_id, context_ids, labels = training_data[i]
print(f"Example {i}:")
print(f" Target: {target_id}")
print(f" Context: {context_ids}")
print(f" Labels: {labels}")
target_word = corpus.index_to_word[target_id[0]]
context_words = [corpus.index_to_word[idx] for idx in context_ids]
print(f"\nExample {i + 1}:")
print(f"Target word: '{target_word}'")
print(f"Context words: {context_words}")
print(f"Labels: {labels}")
print(f"Total context size: {len(context_ids)}")
Generating training examples with window size 5...
100%|██████████| 2297051/2297051 [01:07<00:00, 34022.51it/s]
Generated 2297051 training examples Example 0: Target: [12] Context: [ 27 30 34 37 44 0 16671 58 160 1587 70 2016 165 52080 131 0 192 5320 18830 358 1738 106 6879 1789 7216 7 4110 62 48 350] Labels: [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Example 1: Target word: 'loved' Context words: ['with', 'am', 'familiarity', 'learned', 'concise', 'this', 'seconds', 'characters', 'all', 'either', 'plot', 'super', 'being', 'saskatoon', 'her', 'this', 'line', 'essays', 'jacobs', 'interesting', 'itself', 'time', 'avoid', 'per', 'poignant', 'the', 'rough', 'love', 'have', 'due'] Labels: [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Total context size: 30 Example 1: Target: [27] Context: [ 12 30 34 37 44 55 64 8119 390 108 222 537 52080 18245 844 43 10523 123 803 246 1975 20942 310 374 0 1280 408 48 95 13641] Labels: [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Example 2: Target word: 'with' Context words: ['loved', 'am', 'familiarity', 'learned', 'concise', 'buns', 'to', 'bunch', 'better', 'down', 'tell', 'own', 'saskatoon', 'ugh', 'insight', 'best', 'decor', 'when', 'free', 'away', 'laugh', 'expects', 'not', 'gives', 'this', 'engaging', 'm', 'have', 'life', 'corrected'] Labels: [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Total context size: 30 Example 2: Target: [30] Context: [ 12 27 34 37 44 55 57 4659 50 4221 996 274 3037 3440 1 9598 223 4 7 9703 8668 268 28101 12 1374 759 163 4796 2717 12] Labels: [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Example 3: Target word: 'am' Context words: ['loved', 'with', 'familiarity', 'learned', 'concise', 'buns', 'cast', 'virgin', 'read', 'groups', 'long', 'robin', 'needing', 'street', 'was', 'unseen', 'other', 'a', 'the', 'bookseller', 'shock', 'by', 'polo', 'loved', 'trail', 'highly', 'every', 'colleen', 'en', 'loved'] Labels: [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Total context size: 30
Training Data Generation Insights
The training data generation produced about 2.3 million examples at roughly 34,000 examples per second. Each example pairs a target word with a fixed-size block of context word IDs: true context words from the window are labeled 1 and sampled negatives are labeled 0. In the first example, the target "loved" is paired with its actual neighbors ("with", "am", "familiarity", "learned", "concise") as positives, with the remaining slots filled by negatives. Keeping every instance at the same total size of 30 context words makes the examples directly stackable into tensors for PyTorch's DataLoader.
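Since every instance is padded with negatives up to window_size * 2 + num_negative_samples_per_target context slots, a quick pass over training_data (variables from the cell above) can confirm that all examples share the same shape before they are stacked into tensors.

expected_size = window_size * 2 + num_negative_samples_per_target  # 30 with the settings above

mismatched = 0
for target_id, context_ids, labels in training_data:
    if len(context_ids) != expected_size or len(labels) != expected_size:
        mismatched += 1
print(f"Examples with an unexpected context size: {mismatched} of {len(training_data)}")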
Create the network
class Word2Vec(nn.Module):
def __init__(self, vocab_size, embedding_size):
super(Word2Vec, self).__init__()
# Save state variables
self.vocab_size = vocab_size
self.embedding_size = embedding_size
# Create embedding layers for target and context words
self.target_embeddings = nn.Embedding(vocab_size, embedding_size)
self.context_embeddings = nn.Embedding(vocab_size, embedding_size)
# Initialize embeddings with small non-zero random values
self.init_emb(init_range=0.5 / self.vocab_size)
def init_emb(self, init_range):
# Initialize both embedding matrices with small random values. Note that despite
# the init_range argument, this uses a normal distribution with std 0.1 rather
# than a uniform distribution over +/- init_range.
nn.init.normal_(self.target_embeddings.weight, mean=0, std=0.1)
nn.init.normal_(self.context_embeddings.weight, mean=0, std=0.1)
def forward(self, target_word_id, context_word_ids):
"""
Predicts whether each context word was actually in the context of the target word.
The input is a tensor with a single target word's id and a tensor containing each
of the context words' ids (this includes both positive and negative examples).
"""
# Get embeddings
target_emb = self.target_embeddings(target_word_id).squeeze(
1
) # [batch_size, embedding_size]
context_emb = self.context_embeddings(
context_word_ids
) # [batch_size, context_size, embedding_size]
# Reshape target for broadcasting
target_emb = target_emb.unsqueeze(1) # [batch_size, 1, embedding_size]
# Dot product between the target embedding and each context embedding, scaled by a
# constant factor; returned as logits (no sigmoid) for use with BCEWithLogitsLoss
return torch.bmm(context_emb, target_emb.transpose(1, 2)).squeeze(2) * 10.0
# Create a small instance of the Word2Vec model
vocab_size = len(corpus.word_to_index)
embedding_size = 50
model = Word2Vec(vocab_size, embedding_size)
# Test with a small batch from the training data
batch_size = 3
target_word_ids = np.array([training_data[i][0] for i in range(batch_size)])
context_word_ids = np.array([training_data[i][1] for i in range(batch_size)])
labels = np.array([training_data[i][2] for i in range(batch_size)])
# Convert to PyTorch tensors
target_word_ids_tensor = torch.tensor(target_word_ids)
context_word_ids_tensor = torch.tensor(context_word_ids)
labels_tensor = torch.tensor(labels, dtype=torch.float)
# Forward pass
predictions = model(target_word_ids_tensor, context_word_ids_tensor)
# Print predictions
print("Model test:")
print(
f"Input shape - target_word_ids: {target_word_ids_tensor.shape}, context_word_ids: {context_word_ids_tensor.shape}"
)
print(f"Output shape - predictions: {predictions.shape}")
print(f"Predictions (first example): {predictions[0]}")
print(f"Labels (first example): {labels_tensor[0]}")
# Calculate loss
loss_fn = nn.BCEWithLogitsLoss()
loss = loss_fn(predictions, labels_tensor)
print(f"Loss: {loss.item()}")
Model test: Input shape - target_word_ids: torch.Size([3, 1]), context_word_ids: torch.Size([3, 30]) Output shape - predictions: torch.Size([3, 30]) Predictions (first example): tensor([ 0.7634, 0.6999, -0.4040, -0.9863, 0.5591, -0.3519, 0.1683, 1.1315, 0.1566, -0.1552, -0.1615, 1.0525, -0.5134, 0.4418, 0.1089, -0.3519, -0.1801, -0.4140, 0.0595, 1.8004, 0.6413, 0.4321, -0.5194, -0.1192, 0.5207, -0.1413, -0.6727, 1.1247, -0.6427, -0.1925], grad_fn=<SelectBackward0>) Labels (first example): tensor([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) Loss: 0.8155437707901001
with torch.no_grad():
# Create a simple test case
test_target = torch.tensor([[0]]) # Single target word
test_context = torch.tensor([[1, 2]]) # Two context words
# Get the embeddings
target_emb = model.target_embeddings(test_target)
context_emb = model.context_embeddings(test_context)
# Print shapes and a few values
print(f"Target embedding shape: {target_emb.shape}")
print(f"Context embedding shape: {context_emb.shape}")
print(f"Target embedding sample: {target_emb[0, 0, :10]}") # First 10 values
# Test prediction
pred = model(test_target, test_context)
print(f"Prediction shape: {pred.shape}")
print(f"Predictions: {pred}")
Target embedding shape: torch.Size([1, 1, 50]) Context embedding shape: torch.Size([1, 2, 50]) Target embedding sample: tensor([-0.0499, 0.0120, -0.1029, 0.1216, 0.1093, 0.0347, 0.0603, -0.1009, -0.0870, -0.0142]) Prediction shape: torch.Size([1, 2]) Predictions: tensor([[ 0.0109, -1.0441]])
Model Testing Insights
Initial model testing confirms the expected tensor shapes: target IDs [3, 1], context IDs [3, 30], and predictions [3, 30], i.e., one logit per context word. Before training, the logits are simply noise from the random initialization (spread roughly from -0.99 to 1.80, widened by the constant scale factor in forward()), so the loss of 0.8155 is a chance-level baseline to improve on rather than evidence of any learned differentiation. The embedding shapes (target: [1, 1, 50], context: [1, 2, 50]) confirm that batched lookups return one 50-dimensional vector per word, as specified.
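The bmm in forward() is just a batch of dot products between the target vector and each context vector, followed by the constant scaling; the sketch below recomputes the logits for the small test case with an explicit elementwise product and sum to confirm they match.

with torch.no_grad():
    test_target = torch.tensor([[0]])
    test_context = torch.tensor([[1, 2]])

    t_emb = model.target_embeddings(test_target)    # [1, 1, embedding_size]
    c_emb = model.context_embeddings(test_context)  # [1, 2, embedding_size]

    # Dot product of the target vector with each context vector, scaled as in forward()
    manual_logits = (c_emb * t_emb).sum(dim=-1) * 10.0
    print(torch.allclose(manual_logits, model(test_target, test_context)))  # expect True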
Train the network
# Convert training data to PyTorch tensors
target_ids = np.array([example[0] for example in training_data])
context_ids = np.array([example[1] for example in training_data])
labels = np.array([example[2] for example in training_data], dtype=np.float32)
# Create PyTorch dataset
train_dataset = TensorDataset(
torch.tensor(target_ids), torch.tensor(context_ids), torch.tensor(labels)
)
# Define batch sizes to test
batch_sizes = [2, 8, 32, 64, 128, 256, 512]
timing_results = []
# Set other hyperparameters
vocab_size = len(corpus.word_to_index)
embedding_size = 100
learning_rate = 0.001
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# For each batch size, run a small portion of training to measure performance
for batch_size in batch_sizes:
print(f"\nTesting batch size: {batch_size}")
# Create data loader with current batch size
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=2,
)
# Initialize model and optimizer
test_model = Word2Vec(vocab_size, embedding_size).to(device)
test_optimizer = torch.optim.AdamW(test_model.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()
# Record start time
start_time = time.time()
# Run a small portion of training
test_model.train()
# Limit steps to avoid running too long
max_test_steps = 100
# Use tqdm to measure speed
progress_bar = tqdm(train_loader, desc=f"Batch size {batch_size}")
for step, (target_ids, context_ids, labels) in enumerate(progress_bar):
# Move data to device
target_ids = target_ids.to(device)
context_ids = context_ids.to(device)
labels = labels.to(device)
# Forward pass
predictions = test_model(target_ids, context_ids)
# Calculate loss
loss = loss_function(predictions, labels)
# Backward pass and optimize
test_optimizer.zero_grad()
loss.backward()
test_optimizer.step()
# Stop after max_test_steps
if step >= max_test_steps:
break
# Calculate timing statistics
elapsed_time = time.time() - start_time
steps_completed = min(max_test_steps + 1, len(train_loader))
time_per_step = elapsed_time / steps_completed
estimated_epoch_time = time_per_step * len(train_loader)
timing_results.append(
{
"batch_size": batch_size,
"time_per_step": time_per_step,
"estimated_epoch_time": estimated_epoch_time,
}
)
print(
f"Batch size {batch_size}: {time_per_step:.4f} sec/step, estimated epoch time: {estimated_epoch_time / 60:.2f} min"
)
# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(
[r["batch_size"] for r in timing_results],
[r["estimated_epoch_time"] / 60 for r in timing_results],
"o-",
)
plt.xscale("log", base=2)
plt.xlabel("Batch Size")
plt.ylabel("Estimated Epoch Time (minutes)")
plt.title("Impact of Batch Size on Training Time")
plt.grid(True)
plt.savefig("batch_size_timing.png")
plt.show()
# Print the results in a table format
print("\nBatch Size Comparison Results:")
print("-" * 70)
print(f"{'Batch Size':<15}{'Time per Step (s)':<20}{'Est. Epoch Time (min)':<25}")
print("-" * 70)
for result in timing_results:
print(
f"{result['batch_size']:<15}{result['time_per_step']:.4f}s{'':<10}{result['estimated_epoch_time'] / 60:.2f}m{'':<13}"
)
Testing batch size: 2
Batch size 2: 0%| | 100/1148526 [00:03<10:52:43, 29.32it/s]
Batch size 2: 0.0338 sec/step, estimated epoch time: 646.56 min Testing batch size: 8
Batch size 8: 0%| | 100/287132 [00:03<2:32:03, 31.46it/s]
Batch size 8: 0.0315 sec/step, estimated epoch time: 150.64 min Testing batch size: 32
Batch size 32: 0%| | 100/71783 [00:03<37:25, 31.92it/s]
Batch size 32: 0.0310 sec/step, estimated epoch time: 37.12 min Testing batch size: 64
Batch size 64: 0%| | 100/35892 [00:03<19:00, 31.37it/s]
Batch size 64: 0.0316 sec/step, estimated epoch time: 18.89 min Testing batch size: 128
Batch size 128: 1%| | 100/17946 [00:03<09:49, 30.27it/s]
Batch size 128: 0.0327 sec/step, estimated epoch time: 9.78 min Testing batch size: 256
Batch size 256: 1%| | 100/8973 [00:03<05:03, 29.27it/s]
Batch size 256: 0.0338 sec/step, estimated epoch time: 5.06 min Testing batch size: 512
Batch size 512: 2%|▏ | 100/4487 [00:03<02:39, 27.59it/s]
Batch size 512: 0.0359 sec/step, estimated epoch time: 2.68 min
Batch Size Comparison Results:
----------------------------------------------------------------------
Batch Size     Time per Step (s)   Est. Epoch Time (min)
----------------------------------------------------------------------
2              0.0338s             646.56m
8              0.0315s             150.64m
32             0.0310s             37.12m
64             0.0316s             18.89m
128            0.0327s             9.78m
256            0.0338s             5.06m
512            0.0359s             2.68m
Batch Size Analysis Insights
The batch size analysis reveals a clear efficiency pattern: per-step time stays nearly constant across batch sizes (0.0310s to 0.0359s), so the estimated epoch time falls from about 646 minutes at batch size 2 to about 2.7 minutes at batch size 512, a roughly 240x speedup. Larger batches process more examples per forward/backward pass, so far fewer steps are needed per epoch, and the slight increase in per-step cost is negligible compared to the reduction in step count. This makes the larger batch sizes clearly preferable on this hardware.
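The epoch-time gap is just steps-per-epoch arithmetic: per-step time is roughly constant, so fewer steps means proportionally less time. The sketch below reproduces the estimates for the two extremes using the measured per-step times from the table above.

import math

n_examples = 2_297_051  # number of training examples generated above
seconds_per_step = {2: 0.0338, 512: 0.0359}  # measured values from the timing table

for batch_size, sec in seconds_per_step.items():
    steps_per_epoch = math.ceil(n_examples / batch_size)
    print(f"batch size {batch_size:>3}: {steps_per_epoch:>9,} steps -> ~{steps_per_epoch * sec / 60:.1f} min per epoch")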
# FINAL MODEL TRAINING
# Convert training data to PyTorch tensors
target_ids = np.array([example[0] for example in training_data])
context_ids = np.array([example[1] for example in training_data])
labels = np.array([example[2] for example in training_data], dtype=np.float32)
# Create PyTorch dataset
train_dataset = TensorDataset(
torch.tensor(target_ids), torch.tensor(context_ids), torch.tensor(labels)
)
# Set hyperparameters
vocab_size = len(corpus.word_to_index)
embedding_size = 100
batch_size = 512
learning_rate = 0.001
epochs = 10
max_steps = None
# Initialize weights and biases
wandb.init(
project="word2vec",
config={
"embedding_size": embedding_size,
"batch_size": batch_size,
"learning_rate": learning_rate,
"epochs": epochs,
"vocab_size": vocab_size,
"dataset": "reviews-word2vec.med.txt",
},
)
# Create data loader
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=2,
)
# Initialize model
model = Word2Vec(vocab_size, embedding_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Initialize optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()
# Training loop
print(f"Training on {device}...")
start_time = time.time()
total_steps = 0
for epoch in trange(epochs, desc="Epochs"):
model.train()
epoch_loss = 0
loss_sum = 0
log_interval = 1000 # Log to wandb every 1000 steps
current_lr = learning_rate * (1.0 - epoch / epochs)
for param_group in optimizer.param_groups:
param_group["lr"] = current_lr
# Use tqdm for the inner loop too
progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
for step, (target_ids, context_ids, labels) in enumerate(progress_bar):
# Move data to device
target_ids = target_ids.to(device)
context_ids = context_ids.to(device)
labels = labels.to(device)
# Forward pass
predictions = model(target_ids, context_ids)
# Calculate loss
loss = loss_function(predictions, labels)
# Backward pass and optimize
optimizer.zero_grad()
loss.backward()
if step % 500 == 0: # Print every 500 batches
# Check if gradients are flowing
target_grad_norm = (
model.target_embeddings.weight.grad.norm().item()
if model.target_embeddings.weight.grad is not None
else 0
)
context_grad_norm = (
model.context_embeddings.weight.grad.norm().item()
if model.context_embeddings.weight.grad is not None
else 0
)
print(
f"Batch {step}, Loss: {loss.item():.6f}, Target grad norm: {target_grad_norm:.6f}, Context grad norm: {context_grad_norm:.6f}"
)
optimizer.step()
# Update loss statistics
loss_value = loss.item()
epoch_loss += loss_value
loss_sum += loss_value
# Update progress bar
progress_bar.set_postfix({"Loss": f"{epoch_loss / (step + 1):.4f}"})
# Log to wandb periodically
if (step + 1) % log_interval == 0:
avg_loss = loss_sum / log_interval
wandb.log({"loss": avg_loss, "step": total_steps})
loss_sum = 0
total_steps += 1
# Early stopping if needed
if max_steps is not None and total_steps >= max_steps:
print(f"Reached max steps ({max_steps}). Stopping early.")
break
# Log epoch statistics
epoch_avg_loss = epoch_loss / len(train_loader)
wandb.log({"epoch": epoch, "epoch_loss": epoch_avg_loss})
print(f"Epoch {epoch + 1}/{epochs} - Avg Loss: {epoch_avg_loss:.4f}")
# Training complete
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")
# Set model to evaluation mode
model.eval()
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. wandb: Currently logged in as: axbhatta (axbhatta-university-of-michigan) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
/Users/anupamabhatta/Desktop/u-m/SI 630/Homework 2/wandb/run-20250227_024537-t0e949ma
Training on cpu...
Epochs: 0%| | 0/10 [00:00<?, ?it/s]
Batch 0, Loss: 0.806100, Target grad norm: 0.045808, Context grad norm: 0.044719
Batch 500, Loss: 0.657150, Target grad norm: 0.036550, Context grad norm: 0.034284
Batch 1000, Loss: 0.633538, Target grad norm: 0.031476, Context grad norm: 0.030399
Batch 1500, Loss: 0.614698, Target grad norm: 0.027889, Context grad norm: 0.027375
Batch 2000, Loss: 0.607158, Target grad norm: 0.025920, Context grad norm: 0.025954
Batch 2500, Loss: 0.598566, Target grad norm: 0.024485, Context grad norm: 0.025146
Batch 3000, Loss: 0.601138, Target grad norm: 0.024311, Context grad norm: 0.024972
Batch 3500, Loss: 0.591795, Target grad norm: 0.024667, Context grad norm: 0.024183
Batch 4000, Loss: 0.596943, Target grad norm: 0.025347, Context grad norm: 0.024362
Epoch 1/10: 100%|██████████| 4487/4487 [01:45<00:00, 42.45it/s, Loss=0.6198] Epochs: 10%|█ | 1/10 [01:45<15:51, 105.70s/it]
Epoch 1/10 - Avg Loss: 0.6198
Batch 0, Loss: 0.564135, Target grad norm: 0.022487, Context grad norm: 0.021918
Batch 500, Loss: 0.563666, Target grad norm: 0.023859, Context grad norm: 0.023707
Batch 1000, Loss: 0.562797, Target grad norm: 0.024762, Context grad norm: 0.024357
Batch 1500, Loss: 0.574562, Target grad norm: 0.025905, Context grad norm: 0.025682
Batch 2000, Loss: 0.578679, Target grad norm: 0.026221, Context grad norm: 0.025446
Batch 2500, Loss: 0.575514, Target grad norm: 0.026607, Context grad norm: 0.025514
Batch 3000, Loss: 0.573583, Target grad norm: 0.026633, Context grad norm: 0.026230
Batch 3500, Loss: 0.572819, Target grad norm: 0.026979, Context grad norm: 0.026008
Batch 4000, Loss: 0.568945, Target grad norm: 0.026786, Context grad norm: 0.026185
Epoch 2/10: 100%|██████████| 4487/4487 [01:51<00:00, 40.19it/s, Loss=0.5684] Epochs: 20%|██ | 2/10 [03:37<14:33, 109.21s/it]
Epoch 2/10 - Avg Loss: 0.5684
Batch 0, Loss: 0.537105, Target grad norm: 0.024701, Context grad norm: 0.024413
Batch 500, Loss: 0.549955, Target grad norm: 0.026775, Context grad norm: 0.025738
Batch 1000, Loss: 0.548505, Target grad norm: 0.027436, Context grad norm: 0.026841
Batch 1500, Loss: 0.550575, Target grad norm: 0.028396, Context grad norm: 0.027289
Batch 2000, Loss: 0.551425, Target grad norm: 0.028222, Context grad norm: 0.027359
Batch 2500, Loss: 0.554157, Target grad norm: 0.028609, Context grad norm: 0.027825
Batch 3000, Loss: 0.549442, Target grad norm: 0.028656, Context grad norm: 0.028339
Batch 3500, Loss: 0.556363, Target grad norm: 0.029389, Context grad norm: 0.028114
Batch 4000, Loss: 0.551610, Target grad norm: 0.028808, Context grad norm: 0.027966
Epoch 3/10: 100%|██████████| 4487/4487 [01:50<00:00, 40.74it/s, Loss=0.5499] Epochs: 30%|███ | 3/10 [05:27<12:47, 109.63s/it]
Epoch 3/10 - Avg Loss: 0.5499
Batch 0, Loss: 0.523753, Target grad norm: 0.027083, Context grad norm: 0.026859
Batch 500, Loss: 0.523643, Target grad norm: 0.028361, Context grad norm: 0.027769
Batch 1000, Loss: 0.525226, Target grad norm: 0.028362, Context grad norm: 0.028642
Batch 1500, Loss: 0.538240, Target grad norm: 0.029615, Context grad norm: 0.029055
Batch 2000, Loss: 0.538056, Target grad norm: 0.029982, Context grad norm: 0.029520
Batch 2500, Loss: 0.541295, Target grad norm: 0.030158, Context grad norm: 0.029754
Batch 3000, Loss: 0.540366, Target grad norm: 0.030063, Context grad norm: 0.030123
Batch 3500, Loss: 0.542546, Target grad norm: 0.030103, Context grad norm: 0.030028
Batch 4000, Loss: 0.549563, Target grad norm: 0.030741, Context grad norm: 0.030034
Epoch 4/10: 100%|██████████| 4487/4487 [01:54<00:00, 39.35it/s, Loss=0.5372] Epochs: 40%|████ | 4/10 [07:21<11:08, 111.37s/it]
Epoch 4/10 - Avg Loss: 0.5372
Batch 0, Loss: 0.514981, Target grad norm: 0.028887, Context grad norm: 0.027841
Batch 500, Loss: 0.521701, Target grad norm: 0.029357, Context grad norm: 0.029056
Batch 1000, Loss: 0.520771, Target grad norm: 0.029841, Context grad norm: 0.029635
Batch 1500, Loss: 0.524092, Target grad norm: 0.030718, Context grad norm: 0.030315
Batch 2000, Loss: 0.527243, Target grad norm: 0.030505, Context grad norm: 0.030505
Batch 2500, Loss: 0.529443, Target grad norm: 0.030582, Context grad norm: 0.029482
Batch 3000, Loss: 0.530588, Target grad norm: 0.030637, Context grad norm: 0.030894
Batch 3500, Loss: 0.538615, Target grad norm: 0.031115, Context grad norm: 0.030978
Batch 4000, Loss: 0.540520, Target grad norm: 0.031006, Context grad norm: 0.030372
Epoch 5/10: 100%|██████████| 4487/4487 [01:55<00:00, 38.92it/s, Loss=0.5278] Epochs: 50%|█████ | 5/10 [09:16<09:23, 112.78s/it]
Epoch 5/10 - Avg Loss: 0.5278
Batch 0, Loss: 0.517961, Target grad norm: 0.029293, Context grad norm: 0.029119
Batch 500, Loss: 0.511032, Target grad norm: 0.029653, Context grad norm: 0.030090
Batch 1000, Loss: 0.514696, Target grad norm: 0.030567, Context grad norm: 0.030443
Batch 1500, Loss: 0.520442, Target grad norm: 0.031038, Context grad norm: 0.030419
Batch 2000, Loss: 0.521458, Target grad norm: 0.030684, Context grad norm: 0.030874
Batch 2500, Loss: 0.517272, Target grad norm: 0.031274, Context grad norm: 0.031100
Batch 3000, Loss: 0.523599, Target grad norm: 0.031727, Context grad norm: 0.030806
Batch 3500, Loss: 0.525856, Target grad norm: 0.031535, Context grad norm: 0.031388
Batch 4000, Loss: 0.528439, Target grad norm: 0.031557, Context grad norm: 0.030635
Epoch 6/10: 100%|██████████| 4487/4487 [01:58<00:00, 37.96it/s, Loss=0.5205] Epochs: 60%|██████ | 6/10 [11:15<07:38, 114.63s/it]
Epoch 6/10 - Avg Loss: 0.5205
Batch 0, Loss: 0.512125, Target grad norm: 0.030065, Context grad norm: 0.029759
Batch 500, Loss: 0.507729, Target grad norm: 0.030759, Context grad norm: 0.030706
Batch 1000, Loss: 0.508828, Target grad norm: 0.030545, Context grad norm: 0.030903
Batch 1500, Loss: 0.508096, Target grad norm: 0.030686, Context grad norm: 0.031365
Batch 2000, Loss: 0.509534, Target grad norm: 0.031258, Context grad norm: 0.031317
Batch 2500, Loss: 0.511368, Target grad norm: 0.031297, Context grad norm: 0.031342
Batch 3000, Loss: 0.515064, Target grad norm: 0.031369, Context grad norm: 0.031553
Batch 3500, Loss: 0.514281, Target grad norm: 0.031801, Context grad norm: 0.030731
Batch 4000, Loss: 0.509694, Target grad norm: 0.031046, Context grad norm: 0.031631
Epoch 7/10: 100%|██████████| 4487/4487 [02:13<00:00, 33.65it/s, Loss=0.5144] Epochs: 70%|███████ | 7/10 [13:28<06:02, 120.75s/it]
Epoch 7/10 - Avg Loss: 0.5144
Batch 0, Loss: 0.504315, Target grad norm: 0.030365, Context grad norm: 0.030019
Batch 500, Loss: 0.510433, Target grad norm: 0.030647, Context grad norm: 0.030399
Batch 1000, Loss: 0.502645, Target grad norm: 0.031056, Context grad norm: 0.031456
Batch 1500, Loss: 0.506966, Target grad norm: 0.031022, Context grad norm: 0.030838
Batch 2000, Loss: 0.503645, Target grad norm: 0.030890, Context grad norm: 0.031526
Batch 2500, Loss: 0.507650, Target grad norm: 0.032104, Context grad norm: 0.032314
Batch 3000, Loss: 0.510807, Target grad norm: 0.031505, Context grad norm: 0.031579
Batch 3500, Loss: 0.510263, Target grad norm: 0.031427, Context grad norm: 0.032385
Batch 4000, Loss: 0.514832, Target grad norm: 0.031497, Context grad norm: 0.032509
Epoch 8/10: 100%|██████████| 4487/4487 [02:08<00:00, 34.94it/s, Loss=0.5093] Epochs: 80%|████████ | 8/10 [15:36<04:06, 123.19s/it]
Epoch 8/10 - Avg Loss: 0.5093
Batch 0, Loss: 0.505429, Target grad norm: 0.030919, Context grad norm: 0.030921
Batch 500, Loss: 0.506195, Target grad norm: 0.030931, Context grad norm: 0.031227
Batch 1000, Loss: 0.509269, Target grad norm: 0.031542, Context grad norm: 0.031078
Batch 1500, Loss: 0.506565, Target grad norm: 0.031739, Context grad norm: 0.030955
Batch 2000, Loss: 0.509413, Target grad norm: 0.031464, Context grad norm: 0.031856
Batch 2500, Loss: 0.506214, Target grad norm: 0.031821, Context grad norm: 0.030892
Batch 3000, Loss: 0.514459, Target grad norm: 0.032519, Context grad norm: 0.031765
Batch 3500, Loss: 0.510131, Target grad norm: 0.031574, Context grad norm: 0.031589
Batch 4000, Loss: 0.516147, Target grad norm: 0.031840, Context grad norm: 0.031842
Epoch 9/10: 100%|██████████| 4487/4487 [02:12<00:00, 33.82it/s, Loss=0.5048] Epochs: 90%|█████████ | 9/10 [17:49<02:06, 126.15s/it]
Epoch 9/10 - Avg Loss: 0.5048
Batch 0, Loss: 0.494660, Target grad norm: 0.030854, Context grad norm: 0.031006
Batch 500, Loss: 0.502446, Target grad norm: 0.030726, Context grad norm: 0.031561
Batch 1000, Loss: 0.494833, Target grad norm: 0.030619, Context grad norm: 0.031755
Batch 1500, Loss: 0.501477, Target grad norm: 0.030574, Context grad norm: 0.031982
Batch 2000, Loss: 0.504244, Target grad norm: 0.031359, Context grad norm: 0.031603
Batch 2500, Loss: 0.505813, Target grad norm: 0.031415, Context grad norm: 0.032006
Batch 3000, Loss: 0.512073, Target grad norm: 0.031947, Context grad norm: 0.031274
Batch 3500, Loss: 0.506182, Target grad norm: 0.031396, Context grad norm: 0.032397
Batch 4000, Loss: 0.505057, Target grad norm: 0.031869, Context grad norm: 0.032261
Epoch 10/10: 100%|██████████| 4487/4487 [02:06<00:00, 35.34it/s, Loss=0.5008] Epochs: 100%|██████████| 10/10 [19:56<00:00, 119.64s/it]
Epoch 10/10 - Avg Loss: 0.5008
Training completed in 1196.44 seconds
Word2Vec( (target_embeddings): Embedding(52081, 100) (context_embeddings): Embedding(52081, 100) )
Training Performance Insights
The training metrics show steady improvement across all 10 epochs, with the loss decreasing from 0.6198 to 0.5008. The most substantial improvement occurs in the early epochs (0.0514 reduction between epochs 1 and 2), while later epochs show diminishing returns (only 0.0040 reduction between epochs 9 and 10). This pattern is typical in neural network training, suggesting the model is approaching convergence. The consistent decrease across all epochs justifies the extended training duration, as meaningful improvements continue even in later epochs, producing higher-quality word embeddings.
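The diminishing-returns pattern is easy to quantify by differencing the per-epoch average losses reported in the log above.

epoch_losses = [0.6198, 0.5684, 0.5499, 0.5372, 0.5278, 0.5205, 0.5144, 0.5093, 0.5048, 0.5008]
for epoch, (prev, curr) in enumerate(zip(epoch_losses, epoch_losses[1:]), start=2):
    print(f"Epoch {epoch}: avg loss {curr:.4f} (improvement {prev - curr:.4f})")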
# Check the distribution of 1s and 0s in the labels
positive_count = 0
negative_count = 0
total_samples = 0
for i, (_, _, batch_labels) in enumerate(train_loader):
positive_count += torch.sum(batch_labels == 1).item()
negative_count += torch.sum(batch_labels == 0).item()
total_samples += batch_labels.numel()
if i >= 10:
break
print(
f"Positive samples: {positive_count} ({positive_count / total_samples * 100:.2f}%)"
)
print(
f"Negative samples: {negative_count} ({negative_count / total_samples * 100:.2f}%)"
)
Positive samples: 56259 (33.30%) Negative samples: 112701 (66.70%)
Training Data Balance Insights
The label distribution shows a deliberate imbalance: 33.30% positive samples (actual context words) and 66.70% negative samples (words drawn from the negative-sampling table). The roughly 1:2 ratio falls out of the instance construction: a full window contributes up to 10 positives, and the remaining slots of each 30-word instance are filled with negatives. This follows word2vec's negative-sampling setup, in which each true context word is contrasted against a modest number of sampled non-context words: enough negatives to provide a useful contrast, but not so many that the model could do well by simply predicting "not in context" for everything.
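The measured fraction matches what the construction implies: a target away from the document edges contributes window_size * 2 = 10 positives, and the padding rule tops each instance up to 30 context words with negatives, so the positive share should sit at (or just below) one third.

window_size = 5
num_negative_samples_per_target = 20
total_context = window_size * 2 + num_negative_samples_per_target  # 30 slots per instance

max_positives = window_size * 2  # full window, nothing dropped at document edges
print(f"Expected positive fraction (upper bound): {max_positives / total_context:.1%}")
# The measured 33.30% is essentially at this bound because most targets have a full window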
Verify things are working
def get_neighbors(model, word_to_index, target_word):
"""
Finds the top 10 most similar words to a target word
"""
outputs = []
for word, index in tqdm(word_to_index.items(), total=len(word_to_index)):
similarity = compute_cosine_similarity(model, word_to_index, target_word, word)
result = {"word": word, "score": similarity}
outputs.append(result)
# Sort by highest scores
neighbors = sorted(outputs, key=lambda o: o["score"], reverse=True)
return neighbors[1:11]
def compute_cosine_similarity(model, word_to_index, word_one, word_two):
"""
Computes the cosine similarity between the two words
"""
try:
word_one_index = word_to_index[word_one]
word_two_index = word_to_index[word_two]
except KeyError:
return 0
embedding_one = model.target_embeddings(torch.LongTensor([word_one_index]))
embedding_two = model.target_embeddings(torch.LongTensor([word_two_index]))
similarity = 1 - abs(
float(
cosine(
embedding_one.detach().squeeze().numpy(),
embedding_two.detach().squeeze().numpy(),
)
)
)
return similarity
get_neighbors(model, corpus.word_to_index, "recommend")
100%|██████████| 52081/52081 [00:01<00:00, 39311.52it/s]
[{'word': 'will', 'score': 0.508688485002773}, {'word': 'i', 'score': 0.4807530260605648}, {'word': 'book', 'score': 0.46842723604173087}, {'word': 'very', 'score': 0.4645072523483753}, {'word': 'allocate', 'score': 0.4644097191411686}, {'word': 'kally', 'score': 0.4609552543633242}, {'word': 'well', 'score': 0.4572059902716019}, {'word': 'read', 'score': 0.449186815111665}, {'word': 'anyone', 'score': 0.4393127331582575}, {'word': 'found', 'score': 0.43884326458342393}]
get_neighbors(model, corpus.word_to_index, "son")
100%|██████████| 52081/52081 [00:01<00:00, 46405.93it/s]
[{'word': 'birthday', 'score': 0.5338503158206694}, {'word': 'loves', 'score': 0.5333835711156657}, {'word': 'christmas', 'score': 0.49074814263593436}, {'word': 'nephew', 'score': 0.4796583720860448}, {'word': 'daughter', 'score': 0.46905238231735713}, {'word': 'kids', 'score': 0.45893992897198876}, {'word': 'gift', 'score': 0.45747387872405465}, {'word': 'year', 'score': 0.45493666571332547}, {'word': 'granddaughter', 'score': 0.4519668412158995}, {'word': 'yr', 'score': 0.4457723235624296}]
get_neighbors(model, corpus.word_to_index, "daughter")
100%|██████████| 52081/52081 [00:01<00:00, 51351.20it/s]
[{'word': '14', 'score': 0.4982357717914234}, {'word': 'bought', 'score': 0.47971988282546985}, {'word': 'mother', 'score': 0.47688418155576473}, {'word': 'loves', 'score': 0.46952993241347485}, {'word': 'son', 'score': 0.46905238231735713}, {'word': 'christmas', 'score': 0.4609921650560691}, {'word': 'adores', 'score': 0.45518152027480596}, {'word': 'husband', 'score': 0.4481696709518681}, {'word': 'thompsons', 'score': 0.43712798650257856}, {'word': 'monkeewrench', 'score': 0.4346110443484841}]
get_neighbors(model, corpus.word_to_index, "january")
100%|██████████| 52081/52081 [00:01<00:00, 50172.93it/s]
[{'word': 'aug', 'score': 0.45152324468754546}, {'word': 'ordered', 'score': 0.4385931658743081}, {'word': 'leviticus', 'score': 0.43006971475424893}, {'word': 'incumbent', 'score': 0.4228366992885866}, {'word': '26th', 'score': 0.41948427680576783}, {'word': '2012', 'score': 0.4173595809992383}, {'word': 'september', 'score': 0.41504268319280657}, {'word': 'drosnin', 'score': 0.41378810080846495}, {'word': 'premium', 'score': 0.40672489285378477}, {'word': 'absences', 'score': 0.4061146902540256}]
get_neighbors(model, corpus.word_to_index, "war")
100%|██████████| 52081/52081 [00:01<00:00, 49337.73it/s]
[{'word': 'germany', 'score': 0.6040117444936216}, {'word': 'grander', 'score': 0.546475414678849}, {'word': 'civil', 'score': 0.543046725952603}, {'word': 'nazi', 'score': 0.536052452362023}, {'word': 'fought', 'score': 0.5294266742253091}, {'word': 'soviet', 'score': 0.5195486700333796}, {'word': 'german', 'score': 0.5106734724714671}, {'word': 'jerjian', 'score': 0.5038998053315397}, {'word': 'holocaust', 'score': 0.4974219940414639}, {'word': 'pows', 'score': 0.48646634507127007}]
get_neighbors(model, corpus.word_to_index, "jk")
100%|██████████| 52081/52081 [00:00<00:00, 52102.15it/s]
[{'word': 'rowling', 'score': 0.6375552631884857}, {'word': 'k', 'score': 0.47007977001794155}, {'word': 'j', 'score': 0.4671197393647968}, {'word': 'joyce', 'score': 0.42769347934482804}, {'word': 'wicker', 'score': 0.40634077711243277}, {'word': 'frommetoyouvideophoto', 'score': 0.4017203919735455}, {'word': 'babbled', 'score': 0.3993608727789797}, {'word': 'palahniuk', 'score': 0.39739443213606485}, {'word': 'write', 'score': 0.3947825370544702}, {'word': 'trey', 'score': 0.3859971729932128}]
get_neighbors(model, corpus.word_to_index, "rowling")
100%|██████████| 52081/52081 [00:00<00:00, 52089.90it/s]
[{'word': 'jk', 'score': 0.6375552631884857}, {'word': 'j', 'score': 0.5462167512959784}, {'word': 'k', 'score': 0.5385722940607792}, {'word': 'fervor', 'score': 0.42660046160272325}, {'word': 'imaginitive', 'score': 0.4006617040490874}, {'word': 'potter', 'score': 0.39806735497340817}, {'word': 'pendergrast', 'score': 0.396177696806864}, {'word': 'harry', 'score': 0.3893753411141585}, {'word': 'millworth', 'score': 0.3883774056379907}, {'word': 'hounded', 'score': 0.3847724042334182}]
Word Similarity Insights
Testing word similarities with get_neighbors() shows that the embeddings have learned sensible structure from the review corpus: "son" clusters with "birthday", "loves", "nephew", "daughter", and "gift"; "war" clusters with "germany", "civil", "nazi", "soviet", and "holocaust"; and "jk" and "rowling" are each other's nearest neighbors, with "potter" and "harry" also appearing among rowling's neighbors. Quality varies with frequency and specificity: "recommend" mostly attracts generic review vocabulary ("will", "i", "book", "anyone"), and "january" mixes genuinely related terms ("aug", "september", "26th", "2012") with noise such as "leviticus" and "premium". Overall the model captures both topical groupings (family and gift-giving, World War II) and looser co-occurrence-driven associations, which is what we expect from a skip-gram model trained on distributional patterns.
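Since gensim's KeyedVectors is already imported at the top of the notebook, a convenient follow-up (a minimal sketch, assuming gensim 4.x and its add_vectors API) is to copy the trained target embeddings into a KeyedVectors object, which makes nearest-neighbor queries run over vectorized numpy code instead of the Python loop in get_neighbors.

# Minimal sketch, assuming gensim >= 4.0 (KeyedVectors.add_vectors)
kv = KeyedVectors(vector_size=model.embedding_size)
vocab_words = [corpus.index_to_word[i] for i in range(len(corpus.index_to_word))]
vectors = model.target_embeddings.weight.detach().cpu().numpy()
kv.add_vectors(vocab_words, vectors)

# Should roughly agree with get_neighbors(model, corpus.word_to_index, "war")
print(kv.most_similar("war", topn=10))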