/* * 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */
(function() {
  'use strict';

  // Object that hosts the `Chart` and `CDLChart` globals: `window` in a
  // browser, `globalThis` elsewhere so the script can be loaded without a DOM.
  const root = typeof window !== 'undefined' ? window : globalThis;

  // Non-enumerable marker set on the wrapped constructor so that repeated
  // initialization never wraps the wrapper again.
  const WRAPPED_FLAG = '__cdlWrapped';

  // ==========================================================================
  // READ COLORS FROM CSS CUSTOM PROPERTIES
  // This ensures chart colors stay in sync with the theme
  // ==========================================================================

  // Fallbacks for --chart-color-1 … --chart-color-12 (index 0 is color 1).
  const CHART_COLOR_FALLBACKS = [
    '#00693e', '#267aba', '#ffa00f', '#9d162e', '#8a6996', '#a5d75f',
    '#003c73', '#d94415', '#643c20', '#c4dd88', '#f5dc69', '#424141',
  ];

  // Fallbacks for --chart-bg-1 … --chart-bg-6 (semi-transparent versions).
  const CHART_BG_FALLBACKS = [
    'rgba(0, 105, 62, 0.5)',
    'rgba(38, 122, 186, 0.5)',
    'rgba(255, 160, 15, 0.5)',
    'rgba(157, 22, 46, 0.5)',
    'rgba(138, 105, 150, 0.5)',
    'rgba(165, 215, 95, 0.5)',
  ];

  /**
   * Get a CSS custom property value from :root.
   * @param {string} name - Custom property name, e.g. '--chart-color-1'.
   * @param {string} [fallback=''] - Returned when the property is empty or
   *   no computed style / document is available.
   * @returns {string} The trimmed property value, or the fallback.
   */
  function getCSSVar(name, fallback = '') {
    if (typeof getComputedStyle === 'undefined' || typeof document === 'undefined') {
      return fallback;
    }
    const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim();
    return value || fallback;
  }

  /**
   * Build the palette from CSS custom properties (with hard-coded fallbacks).
   * @returns {object} Palette with brand, chart, semantic, grid and font entries.
   */
  function buildPaletteFromCSS() {
    return {
      // Primary brand colors
      dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'),
      textPrimary: getCSSVar('--text-primary', '#0a2518'),
      textSecondary: getCSSVar('--text-secondary', '#0a3d23'),

      // Chart colors (from CSS --chart-color-N variables)
      chartColors: CHART_COLOR_FALLBACKS.map(
        (fallback, i) => getCSSVar(`--chart-color-${i + 1}`, fallback)
      ),

      // Background colors (semi-transparent versions)
      chartBgColors: CHART_BG_FALLBACKS.map(
        (fallback, i) => getCSSVar(`--chart-bg-${i + 1}`, fallback)
      ),

      // Semantic colors
      positive: getCSSVar('--chart-positive', '#00693e'),
      negative: getCSSVar('--chart-negative', '#9d162e'),
      neutral: getCSSVar('--chart-neutral', '#424141'),
      highlight: getCSSVar('--chart-highlight', '#ffa00f'),

      // Grid and axis colors
      gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'),
      gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'),
      gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'),
      axisColor: getCSSVar('--chart-axis-color', '#0a2518'),

      // Font
      fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"),
    };
  }

  // Palette is built lazily on first use (see ensurePalette)
  let CDL_PALETTE = null;

  // For convenience, the primary chart colors array of the palette
  let CHART_COLORS = null;

  // ==========================================================================
  // FONT CONFIGURATION
  // Responsive font sizes based on typical Marp slide dimensions (1280x720)
  // ==========================================================================

  const FONT_CONFIG = {
    sizes: {
      title: 22,        // Chart title
      subtitle: 18,     // Subtitle
      legend: 16,       // Legend labels
      axisTitle: 18,    // Axis titles
      axisTicks: 16,    // Axis tick labels
      tooltip: 14,      // Tooltip text
      dataLabels: 14,   // Data labels on charts
    },
    weight: {
      normal: 400,
      medium: 500,
      bold: 600,
    },
  };

  // ==========================================================================
  // HELPER FUNCTIONS
  // ==========================================================================

  /**
   * Ensure the palette is initialized, building it on first call.
   * @returns {object} The shared palette object.
   */
  function ensurePalette() {
    if (!CDL_PALETTE) {
      CDL_PALETTE = buildPaletteFromCSS();
      CHART_COLORS = CDL_PALETTE.chartColors;
    }
    return CDL_PALETTE;
  }

  /**
   * Get the color for a dataset at the given index.
   * Cycles through the palette when there are more datasets than colors;
   * negative indices wrap around from the end instead of yielding undefined.
   * @param {number} index - Dataset index.
   * @returns {string} A palette color.
   */
  function getColor(index) {
    ensurePalette();
    const count = CHART_COLORS.length;
    return CHART_COLORS[((index % count) + count) % count];
  }

  /**
   * Get a color with alpha transparency applied.
   *
   * Supports `#rgb`, `#rgba`, `#rrggbb`, `#rrggbbaa` (any existing hex alpha
   * is replaced), `rgba(...)` and comma-separated `rgb(r, g, b)`. Anything
   * else (named colors, non-strings, unparsable hex) is returned unchanged.
   * @param {string} color - Source color.
   * @param {number} alpha - Opacity to apply.
   * @returns {*} An `rgba(...)` string, or the input when it cannot be converted.
   */
  function getColorWithAlpha(color, alpha) {
    if (typeof color !== 'string') return color;
    const value = color.trim();

    // Handle hex colors
    if (value.startsWith('#')) {
      let hex = value.slice(1);
      if (hex.length === 3 || hex.length === 4) {
        // Expand shorthand: "abc" -> "aabbcc"
        hex = hex.split('').map((ch) => ch + ch).join('');
      }
      const channels = [0, 2, 4].map((start) => parseInt(hex.slice(start, start + 2), 16));
      if (channels.some((channel) => Number.isNaN(channel))) return color;
      return `rgba(${channels[0]}, ${channels[1]}, ${channels[2]}, ${alpha})`;
    }

    // Handle rgba colors: swap the trailing alpha component
    if (value.startsWith('rgba')) {
      return value.replace(/[\d.]+\)$/, `${alpha})`);
    }

    // Handle comma-separated rgb(r, g, b)
    const rgbMatch = /^rgb\(\s*([^)]+?)\s*\)$/.exec(value);
    if (rgbMatch) {
      const parts = rgbMatch[1].split(',').map((part) => part.trim());
      if (parts.length === 3) {
        return `rgba(${parts.join(', ')}, ${alpha})`;
      }
    }

    return color;
  }

  // Assign `value` to `target[key]` only when the current value is falsy.
  function setIfFalsy(target, key, value) {
    if (!target[key]) target[key] = value;
  }

  // Assign `value` to `target[key]` only when the key is undefined (0 is kept).
  function setIfUndefined(target, key, value) {
    if (target[key] === undefined) target[key] = value;
  }

  /**
   * Generate colors for all datasets in chart data.
   * Fills in colors and a few sizing defaults only where the dataset does not
   * already specify them. The datasets are updated in place and the same
   * `data` object is returned.
   * @param {object} data - Chart.js data object (`{ labels, datasets }`).
   * @param {string} chartType - Chart type, e.g. 'bar', 'line', 'pie'.
   * @returns {object} The `data` argument.
   */
  function autoAssignColors(data, chartType) {
    if (!data || !Array.isArray(data.datasets)) return data;

    data.datasets.forEach((dataset, index) => {
      if (!dataset) return;
      const baseColor = getColor(index);

      switch (chartType) {
        case 'bar':
        case 'horizontalBar':
          setIfFalsy(dataset, 'backgroundColor', baseColor);
          setIfFalsy(dataset, 'borderColor', baseColor);
          setIfUndefined(dataset, 'borderWidth', 2);
          break;

        case 'line':
          setIfFalsy(dataset, 'borderColor', baseColor);
          setIfFalsy(dataset, 'backgroundColor', getColorWithAlpha(baseColor, 0.1));
          setIfUndefined(dataset, 'borderWidth', 3);
          setIfUndefined(dataset, 'pointRadius', 6);
          setIfFalsy(dataset, 'pointBackgroundColor', baseColor);
          setIfUndefined(dataset, 'tension', 0.3);
          break;

        case 'scatter':
        case 'bubble':
          setIfFalsy(dataset, 'backgroundColor', baseColor);
          setIfFalsy(dataset, 'borderColor', baseColor);
          setIfUndefined(dataset, 'pointRadius', 15);
          setIfUndefined(dataset, 'pointHoverRadius', 18);
          break;

        case 'pie':
        case 'doughnut':
        case 'polarArea':
          // For pie charts, we need multiple colors for one dataset
          if (!dataset.backgroundColor) {
            const sliceCount = dataset.data ? dataset.data.length : 6;
            dataset.backgroundColor = Array.from({ length: sliceCount }, (_, i) => getColor(i));
          }
          setIfFalsy(dataset, 'borderColor', '#d8d8d8'); // Slide background
          setIfUndefined(dataset, 'borderWidth', 2);
          break;

        case 'radar':
          setIfFalsy(dataset, 'borderColor', baseColor);
          setIfFalsy(dataset, 'backgroundColor', getColorWithAlpha(baseColor, 0.2));
          setIfUndefined(dataset, 'borderWidth', 2);
          setIfUndefined(dataset, 'pointRadius', 4);
          setIfFalsy(dataset, 'pointBackgroundColor', baseColor);
          break;

        default:
          // Generic color assignment
          setIfFalsy(dataset, 'backgroundColor', baseColor);
          setIfFalsy(dataset, 'borderColor', baseColor);
      }
    });

    return data;
  }

  // ==========================================================================
  // CHART.JS GLOBAL DEFAULTS
  // ==========================================================================

  /**
   * Write palette and font settings into `Chart.defaults`.
   * @returns {boolean} false (after a console warning) when Chart.js is not
   *   loaded, true once the defaults were applied.
   */
  function applyGlobalDefaults() {
    if (typeof Chart === 'undefined') {
      console.warn('Chart.js not loaded. chart-defaults.js requires Chart.js to be loaded first.');
      return false;
    }

    // Ensure palette is loaded from CSS
    const palette = ensurePalette();
    const defaults = Chart.defaults;

    // Font defaults
    defaults.font.family = palette.fontFamily;
    defaults.font.size = FONT_CONFIG.sizes.axisTicks;
    defaults.color = palette.textPrimary;

    // Responsive defaults
    defaults.responsive = true;
    defaults.maintainAspectRatio = false;

    // Animation (subtle); skipped when animation is not an options object
    if (defaults.animation) {
      defaults.animation.duration = 400;
    }

    // Legend
    const legendLabels = defaults.plugins.legend.labels;
    legendLabels.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.legend,
      weight: FONT_CONFIG.weight.normal,
    };
    legendLabels.color = palette.textPrimary;
    legendLabels.usePointStyle = true;
    legendLabels.padding = 20;

    // Title
    defaults.plugins.title.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.title,
      weight: FONT_CONFIG.weight.medium,
    };
    defaults.plugins.title.color = palette.textPrimary;

    // Tooltip
    const tooltip = defaults.plugins.tooltip;
    tooltip.backgroundColor = palette.textPrimary;
    tooltip.titleFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
      weight: FONT_CONFIG.weight.medium,
    };
    tooltip.bodyFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
    };
    tooltip.cornerRadius = 4;
    tooltip.padding = 10;

    // Scale defaults (for cartesian charts), applied per scale type
    const scaleDefaults = {
      grid: {
        color: palette.gridLight,
        lineWidth: 1,
      },
      border: {
        color: palette.gridDark,
        width: 1,
      },
      ticks: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        },
        color: palette.textPrimary,
      },
      title: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTitle,
          weight: FONT_CONFIG.weight.normal,
        },
        color: palette.textPrimary,
      },
    };

    const scales = defaults.scales || {};

    // Only sections that already exist on a scale's defaults are touched.
    ['linear', 'category', 'logarithmic'].forEach((scaleType) => {
      const scale = scales[scaleType];
      if (!scale) return;
      Object.keys(scaleDefaults).forEach((section) => {
        if (scale[section]) Object.assign(scale[section], scaleDefaults[section]);
      });
    });

    // Radial scale (for radar charts)
    const radial = scales.radialLinear;
    if (radial) {
      if (radial.grid) radial.grid.color = palette.gridLight;
      if (radial.angleLines) radial.angleLines.color = palette.gridMedium;
      if (radial.pointLabels) {
        radial.pointLabels.font = {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        };
        radial.pointLabels.color = palette.textPrimary;
      }
    }

    return true;
  }

  // ==========================================================================
  // CHART WRAPPER FOR AUTO-STYLING
  // ==========================================================================

  /**
   * Replace the global `Chart` with a wrapper that applies CDL styling to the
   * config before delegating to the original constructor.
   * Does nothing when Chart.js is missing or `Chart` is already the wrapper.
   */
  function wrapChartConstructor() {
    if (typeof Chart === 'undefined') return;
    if (Chart[WRAPPED_FLAG]) return;

    const OriginalChart = Chart;

    const WrappedChart = function(ctx, config) {
      if (config) {
        // Auto-assign colors if not specified
        if (config.data) {
          config.data = autoAssignColors(config.data, config.type);
        }
        // Chart-type defaults also apply when the caller passed no options
        config.options = applyChartTypeDefaults(config.type, config.options || {});
      }
      return new OriginalChart(ctx, config);
    };

    // Static members resolve through the prototype chain and own copies
    Object.setPrototypeOf(WrappedChart, OriginalChart);
    Object.assign(WrappedChart, OriginalChart);

    // Share the prototype so `instanceof Chart` keeps working
    WrappedChart.prototype = OriginalChart.prototype;

    Object.defineProperty(WrappedChart, WRAPPED_FLAG, { value: true });
    root.Chart = WrappedChart;
  }

  // Copy of `scales` that is guaranteed to have `x` and `y` objects; the
  // caller's `scales`, `x` and `y` objects are left untouched.
  function withXYScales(scales) {
    const copy = { ...scales };
    copy.x = { ...copy.x };
    copy.y = { ...copy.y };
    return copy;
  }

  /**
   * Apply chart-type specific defaults.
   * Works on copies of every object it changes, so `userOptions` and its
   * nested `scales` / `plugins` objects are not modified.
   * @param {string} chartType - Chart type, e.g. 'bar', 'line', 'pie'.
   * @param {object} userOptions - Options supplied by the caller.
   * @returns {object} New options object with the defaults filled in.
   */
  function applyChartTypeDefaults(chartType, userOptions) {
    const options = { ...userOptions };

    switch (chartType) {
      case 'bar':
      case 'horizontalBar':
        options.scales = withXYScales(options.scales);
        // Hide x-axis grid for cleaner look
        if (options.scales.x.grid === undefined) {
          options.scales.x.grid = { display: false };
        }
        break;

      case 'line':
        if (!options.interaction) {
          options.interaction = { intersect: false, mode: 'index' };
        }
        break;

      case 'pie':
      case 'doughnut':
        options.plugins = { ...options.plugins };
        if (options.plugins.legend === undefined) {
          const palette = ensurePalette();
          options.plugins.legend = {
            position: 'right',
            labels: {
              font: {
                family: palette.fontFamily,
                size: FONT_CONFIG.sizes.legend,
              },
              color: palette.textPrimary,
              padding: 15,
            },
          };
        }
        break;

      case 'radar':
        // Radar chart defaults - keep as-is, scale defaults applied globally
        break;

      case 'scatter':
      case 'bubble':
        options.scales = withXYScales(options.scales);
        break;
    }

    return options;
  }

  // ==========================================================================
  // INITIALIZATION
  // ==========================================================================

  // Handle of the interval that waits for Chart.js, or null when not polling
  let pollTimer = null;

  // Stop waiting for Chart.js (no-op when no poll is running)
  function stopPolling() {
    if (pollTimer !== null) {
      clearInterval(pollTimer);
      pollTimer = null;
    }
  }

  /**
   * Apply global defaults and wrap `Chart`.
   * When Chart.js is not available yet and a DOM exists, polls every 100 ms
   * for up to 50 attempts (about 5 seconds) before giving up with a warning.
   * @returns {boolean} true when styling was applied synchronously.
   */
  function initialize() {
    if (typeof Chart !== 'undefined') {
      stopPolling();
      applyGlobalDefaults();
      wrapChartConstructor();
      console.log('CDL Chart defaults applied successfully.');
      return true;
    }

    // Without a DOM there is no page that could still be loading Chart.js,
    // so no timer is started; also never start a second poll.
    if (typeof document === 'undefined' || pollTimer !== null) {
      return false;
    }

    let retries = 0;
    const maxRetries = 50; // 5 seconds max wait
    pollTimer = setInterval(function() {
      retries++;
      if (typeof Chart !== 'undefined') {
        stopPolling();
        applyGlobalDefaults();
        wrapChartConstructor();
        console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).');
      } else if (retries >= maxRetries) {
        stopPolling();
        console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.');
      }
    }, 100);
    return false;
  }

  // ==========================================================================
  // CONVENIENCE FUNCTIONS FOR USERS
  // Exposed on window.CDLChart for easy access
  // ==========================================================================

  // Build a chart of `type` on the canvas with the given element ID.
  function createChart(canvasId, type, data, options) {
    return new Chart(document.getElementById(canvasId), {
      type: type,
      data: data,
      options: options,
    });
  }

  // Reduce caller datasets to the `{ label, data }` pair the helpers forward.
  function toLabelledDatasets(datasets) {
    return datasets.map((ds) => ({
      label: ds.label,
      data: ds.data,
    }));
  }

  root.CDLChart = {
    // Color palette access (getters to ensure lazy initialization)
    get colors() {
      return ensurePalette().chartColors;
    },
    get palette() {
      return ensurePalette();
    },

    // Get specific color by index
    getColor: getColor,

    // Get color with transparency
    getColorWithAlpha: getColorWithAlpha,

    /**
     * Get an array of colors for a specific count.
     * @param {number} count - Number of colors wanted.
     * @returns {string[]} Colors for indices 0 … count-1 (cycling the palette).
     */
    getColors: function(count) {
      const result = [];
      for (let i = 0; i < count; i++) {
        result.push(getColor(i));
      }
      return result;
    },

    // Font configuration
    fonts: FONT_CONFIG,

    /**
     * Re-run initialization, e.g. when Chart.js was loaded after this script.
     * Safe to call repeatedly: `Chart` is wrapped at most once.
     * @returns {boolean} true when styling was applied.
     */
    init: initialize,

    // Quick chart creation helpers
    // These create minimal config that auto-applies all styling

    /**
     * Create a simple bar chart (legend hidden unless overridden).
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    bar: function(canvasId, labels, data, options = {}) {
      return createChart(
        canvasId,
        'bar',
        { labels: labels, datasets: [{ data: data }] },
        { plugins: { legend: { display: false } }, ...options }
      );
    },

    /**
     * Create a simple line chart (datasets are filled unless `fill` is set).
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    line: function(canvasId, labels, datasets, options = {}) {
      const lineDatasets = datasets.map((ds) => ({
        label: ds.label,
        data: ds.data,
        fill: ds.fill !== undefined ? ds.fill : true,
      }));
      return createChart(canvasId, 'line', { labels: labels, datasets: lineDatasets }, options);
    },

    /**
     * Create a simple pie chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    pie: function(canvasId, labels, data, options = {}) {
      return createChart(canvasId, 'pie', { labels: labels, datasets: [{ data: data }] }, options);
    },

    /**
     * Create a simple scatter chart
     * @param {string} canvasId - Canvas element ID
     * @param {Array} datasets - Array of {label, data: [{x, y}]} objects
     * @param {object} options - Optional overrides
     */
    scatter: function(canvasId, datasets, options = {}) {
      return createChart(canvasId, 'scatter', { datasets: toLabelledDatasets(datasets) }, options);
    },

    /**
     * Create a doughnut chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    doughnut: function(canvasId, labels, data, options = {}) {
      return createChart(canvasId, 'doughnut', { labels: labels, datasets: [{ data: data }] }, options);
    },

    /**
     * Create a radar chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    radar: function(canvasId, labels, datasets, options = {}) {
      return createChart(
        canvasId,
        'radar',
        { labels: labels, datasets: toLabelledDatasets(datasets) },
        options
      );
    },
  };

  // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts
  // Chart.js CDN should be loaded before this script
  initialize();

})();
Models of Language and Conversation

Lecture 23: Implementing GPT from Scratch

Building a Language Model in PyTorch 💻

Models of Language and Conversation

Week 7

Week 7
Models of Language and Conversation

Today's Journey 🗺️

Week 7
Models of Language and Conversation

What We'll Build Today 🏗️

<div class="callout warning">
Goal

Implement a simplified GPT model from scratch in PyTorch!

**Components we'll cover:**
1. 🔤 Tokenization (Byte-Pair Encoding)
2. 📦 Embeddings (token + position)
3. 👁️ Masked Multi-Head Attention
4. 🧠 Transformer Decoder Block
5. 🎯 Language Model Head
6. 🎓 Training Loop
7. 🎲 Text Generation

Think about it!

We'll build a "nano-GPT"—small enough to train on a laptop, but with the same architecture as the real thing!

Week 7
Models of Language and Conversation

Required Libraries 📚


1import torch
2import torch.nn as nn
3import torch.nn.functional as F
4from torch.utils.data import Dataset, DataLoader
5import numpy as np
6import tiktoken  # OpenAI's BPE tokenizer
7
8# Check for GPU
9device = 'cuda' if torch.cuda.is_available() else 'cpu'
10print(f"Using device: {device}")
11
12# Hyperparameters
13config = {
14    'vocab_size': 50257,      # GPT-2 vocabulary size
15    'd_model': 384,            # Embedding dimension
16    'n_layers': 6,             # Number of transformer blocks
17    'n_heads': 6,              # Number of attention heads
18    'max_seq_len': 256,        # Maximum sequence length
19    'dropout': 0.1,
20    'batch_size': 32,
continued...
Week 7
Models of Language and Conversation

Required Libraries 📚


21    'learning_rate': 3e-4,
22    'num_epochs': 10
23}
...continued

Install: pip install torch tiktoken

Week 7
Models of Language and Conversation

What is Tokenization? 🔤

<div class="callout info">
Tokenization

Converting text into a sequence of integer IDs that the model can process.

**Common strategies:**
1. **Word-level**: Split on spaces
    - ❌ Huge vocabulary, can't handle unknown words
2. **Character-level**: Individual characters
    - ❌ Very long sequences, loses word structure
3. **Subword-level (BPE)**: Best of both worlds! ✅
    - Frequent words: one token
    - Rare words: multiple subword tokens
    - Can handle any word via character fallback
Week 7
Models of Language and Conversation

Byte-Pair Encoding (BPE) 🧩

**How BPE works:**



1. Start with characters as base vocabulary
2. Find most frequent pair of adjacent tokens
3. Merge this pair into a new token
4. Repeat until desired vocabulary size

Example
    **Text:** "low low low lower lower newest newest"

    

    **Iterations:**
    1. Merge "l" + "o" → "lo"
    2. Merge "lo" + "w" → "low"
    3. Keep merging one pair at a time: "e" + "r" → "er", then "low" + "er" → "lower"
    4. Repeat pairwise merges until "n", "e", "w", "e", "s", "t" have been combined into "newest"

     **Vocabulary:** [l, o, w, e, r, n, s, t, lo, low, lower, newest, ...]
    
Week 7
Models of Language and Conversation

BPE Tokenization in Code 💻


1import tiktoken
2
3# Load GPT-2 tokenizer (uses BPE)
4tokenizer = tiktoken.get_encoding("gpt2")
5
6# Example text
7text = "Hello, how are you doing today?"
8
9# Encode: text -> token IDs
10tokens = tokenizer.encode(text)
11print("Tokens:", tokens)
12# Output: [15496, 11, 703, 389, 345, 1804, 1909, 30]
13
14# Decode: token IDs -> text
15decoded = tokenizer.decode(tokens)
16print("Decoded:", decoded)
17# Output: "Hello, how are you doing today?"
18
19# See individual token strings
20for token_id in tokens:
continued...
Week 7
Models of Language and Conversation

BPE Tokenization in Code 💻


21    token_str = tokenizer.decode([token_id])
22    print(f"{token_id}: '{token_str}'")
23
24# Vocabulary size
25print(f"Vocab size: {tokenizer.n_vocab}")  # 50257
...continued
Week 7
Models of Language and Conversation

Creating a Text Dataset 📚


1class TextDataset(Dataset):
2    def __init__(self, text_file, tokenizer, max_seq_len):
3        # Load text
4        with open(text_file, 'r', encoding='utf-8') as f:
5            text = f.read()
6
7        # Tokenize entire text
8        self.tokens = tokenizer.encode(text)
9        self.max_seq_len = max_seq_len
10
11    def __len__(self):
12        # Number of sequences we can extract
13        return len(self.tokens) - self.max_seq_len
14
15    def __getitem__(self, idx):
16        # Get sequence of length max_seq_len + 1
17        # (we need +1 for the target)
18        chunk = self.tokens[idx:idx + self.max_seq_len + 1]
19
20        # Input: all but last token
continued...
Week 7
Models of Language and Conversation

Creating a Text Dataset 📚


21        x = torch.tensor(chunk[:-1], dtype=torch.long)
22        # Target: all but first token
23        y = torch.tensor(chunk[1:], dtype=torch.long)
24
25        return x, y
26
27# Usage
28dataset = TextDataset('shakespeare.txt', tokenizer, config['max_seq_len'])
29dataloader = DataLoader(dataset, batch_size=config['batch_size'], shuffle=True)
...continued
Week 7
Models of Language and Conversation

Token and Position Embeddings 📦


1class Embeddings(nn.Module):
2    def __init__(self, vocab_size, d_model, max_seq_len, dropout):
3        super().__init__()
4        # Token embeddings: map token IDs to vectors
5        self.token_embed = nn.Embedding(vocab_size, d_model)
6
7        # Position embeddings: encode position information
8        self.pos_embed = nn.Embedding(max_seq_len, d_model)
9
10        self.dropout = nn.Dropout(dropout)
11        self.d_model = d_model
12
13    def forward(self, x):
14        # x shape: (batch_size, seq_len)
15        seq_len = x.size(1)
16
17        # Token embeddings
18        tok_emb = self.token_embed(x)  # (batch, seq_len, d_model)
19
20        # Position embeddings
continued...
Week 7
Models of Language and Conversation

Token and Position Embeddings 📦


21        positions = torch.arange(0, seq_len, device=x.device)
22        pos_emb = self.pos_embed(positions)  # (seq_len, d_model)
23
24        # Combine (broadcasting handles batch dimension)
25        embeddings = tok_emb + pos_emb
26
27        return self.dropout(embeddings)
...continued
Week 7
Models of Language and Conversation

Masked Multi-Head Attention 👁️


1class MultiHeadAttention(nn.Module):
2    def __init__(self, d_model, n_heads, dropout):
3        super().__init__()
4        assert d_model % n_heads == 0
5
6        self.d_model = d_model
7        self.n_heads = n_heads
8        self.head_dim = d_model // n_heads
9
10        # Linear layers for Q, K, V
11        self.q_linear = nn.Linear(d_model, d_model)
12        self.k_linear = nn.Linear(d_model, d_model)
13        self.v_linear = nn.Linear(d_model, d_model)
14
15        # Output projection
16        self.out_linear = nn.Linear(d_model, d_model)
17        self.dropout = nn.Dropout(dropout)
18
19    def forward(self, x, mask=None):
20        batch_size, seq_len, d_model = x.shape
continued...
Week 7
Models of Language and Conversation

Masked Multi-Head Attention 👁️


21
22        # Linear projections and split into heads
23        Q = self.q_linear(x).view(batch_size, seq_len, self.n_heads, self.head_dim)
24        K = self.k_linear(x).view(batch_size, seq_len, self.n_heads, self.head_dim)
25        V = self.v_linear(x).view(batch_size, seq_len, self.n_heads, self.head_dim)
26
27        # Transpose for attention: (batch, n_heads, seq_len, head_dim)
28        Q = Q.transpose(1, 2)
29        K = K.transpose(1, 2)
30        V = V.transpose(1, 2)
...continued
Week 7
Models of Language and Conversation

Attention Computation (cont.) 🧮


1# Scaled dot-product attention
2        # Scores: (batch, n_heads, seq_len, seq_len)
3        scores = torch.matmul(Q, K.transpose(-2, -1)) / np.sqrt(self.head_dim)
4
5        # Apply causal mask (prevent attending to future tokens)
6        if mask is not None:
7            scores = scores.masked_fill(mask == 0, float('-inf'))
8
9        # Softmax to get attention weights
10        attn_weights = F.softmax(scores, dim=-1)
11        attn_weights = self.dropout(attn_weights)
12
13        # Apply attention to values
14        # Output: (batch, n_heads, seq_len, head_dim)
15        attn_output = torch.matmul(attn_weights, V)
16
17        # Concatenate heads
18        attn_output = attn_output.transpose(1, 2).contiguous()
19        attn_output = attn_output.view(batch_size, seq_len, d_model)
20
continued...
Week 7
Models of Language and Conversation

Attention Computation (cont.) 🧮


21        # Final linear projection
22        output = self.out_linear(attn_output)
23
24        return output
...continued
Week 7
Models of Language and Conversation

Creating the Causal Mask 🎭


1def create_causal_mask(seq_len, device):
2    """
3    Create a causal (lower-triangular) mask for autoregressive generation.
4
5    Returns:
6        mask: (seq_len, seq_len) with 1s on and below diagonal, 0s above
7    """
8    mask = torch.tril(torch.ones(seq_len, seq_len, device=device))
9    return mask  # Shape: (seq_len, seq_len)
10
11# Example: 5x5 causal mask
12mask = create_causal_mask(5, 'cpu')
13print(mask)
14# tensor([[1., 0., 0., 0., 0.],
15#         [1., 1., 0., 0., 0.],
16#         [1., 1., 1., 0., 0.],
17#         [1., 1., 1., 1., 0.],
18#         [1., 1., 1., 1., 1.]])

Key insight: Each position can only attend to itself and previous positions!

Week 7
Models of Language and Conversation

Feed-Forward Network 🔄


1class FeedForward(nn.Module):
2    def __init__(self, d_model, dropout):
3        super().__init__()
4        # GPT uses 4 * d_model as the hidden dimension
5        self.net = nn.Sequential(
6            nn.Linear(d_model, 4 * d_model),
7            nn.GELU(),  # GPT uses GELU activation
8            nn.Linear(4 * d_model, d_model),
9            nn.Dropout(dropout)
10        )
11
12    def forward(self, x):
13        return self.net(x)

Why GELU (Gaussian Error Linear Unit)?

  • Smooth, non-monotonic activation
  • Used in BERT and GPT
  • Slight improvement over ReLU for language models

Week 7
Models of Language and Conversation

Transformer Decoder Block 🧱


1class TransformerBlock(nn.Module):
2    def __init__(self, d_model, n_heads, dropout):
3        super().__init__()
4        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
5        self.feed_forward = FeedForward(d_model, dropout)
6
7        # Layer normalization (applied before sub-layers in GPT)
8        self.ln1 = nn.LayerNorm(d_model)
9        self.ln2 = nn.LayerNorm(d_model)
10
11    def forward(self, x, mask):
12        # Pre-norm architecture (used in GPT)
13        # Attention with residual connection
14        x = x + self.attention(self.ln1(x), mask)
15
16        # Feed-forward with residual connection
17        x = x + self.feed_forward(self.ln2(x))
18
19        return x

Note: GPT-2 and later GPT models use pre-norm (LayerNorm before each sub-layer), while the original Transformer (and GPT-1) used post-norm (LayerNorm after the residual addition).

Week 7
Models of Language and Conversation

Complete GPT Model 🤖


1class GPT(nn.Module):
2    def __init__(self, vocab_size, d_model, n_layers, n_heads, max_seq_len, dropout):
3        super().__init__()
4        self.max_seq_len = max_seq_len
5
6        # Embeddings
7        self.embeddings = Embeddings(vocab_size, d_model, max_seq_len, dropout)
8
9        # Transformer blocks
10        self.blocks = nn.ModuleList([
11            TransformerBlock(d_model, n_heads, dropout)
12            for _ in range(n_layers)
13        ])
14
15        # Final layer norm
16        self.ln_f = nn.LayerNorm(d_model)
17
18        # Language model head (projects to vocabulary)
19        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)
20
continued...
Week 7
Models of Language and Conversation

Complete GPT Model 🤖


21        # Initialize weights
22        self.apply(self._init_weights)
23
24    def _init_weights(self, module):
25        if isinstance(module, nn.Linear):
26            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
27            if module.bias is not None:
28                torch.nn.init.zeros_(module.bias)
29        elif isinstance(module, nn.Embedding):
30            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
...continued
Week 7
Models of Language and Conversation

GPT Forward Pass 🔜


1def forward(self, x, targets=None):
2        # x shape: (batch_size, seq_len)
3        seq_len = x.size(1)
4
5        # Create causal mask
6        mask = create_causal_mask(seq_len, x.device)
7
8        # Embeddings
9        x = self.embeddings(x)  # (batch, seq_len, d_model)
10
11        # Apply transformer blocks
12        for block in self.blocks:
13            x = block(x, mask)
14
15        # Final layer norm
16        x = self.ln_f(x)
17
18        # Project to vocabulary
19        logits = self.lm_head(x)  # (batch, seq_len, vocab_size)
20
continued...
Week 7
Models of Language and Conversation

GPT Forward Pass 🔜


21        # Compute loss if targets provided
22        loss = None
23        if targets is not None:
24            # Flatten for cross-entropy
25            loss = F.cross_entropy(
26                logits.view(-1, logits.size(-1)),
27                targets.view(-1)
28            )
29
30        return logits, loss
...continued
Week 7
Models of Language and Conversation

Training Loop 🎓


1# Initialize model
2model = GPT(
3    vocab_size=config['vocab_size'],
4    d_model=config['d_model'],
5    n_layers=config['n_layers'],
6    n_heads=config['n_heads'],
7    max_seq_len=config['max_seq_len'],
8    dropout=config['dropout']
9).to(device)
10
11# Optimizer (AdamW is used for GPT)
12optimizer = torch.optim.AdamW(
13    model.parameters(),
14    lr=config['learning_rate'],
15    betas=(0.9, 0.95),
16    weight_decay=0.1
17)
18
19# Training loop
20model.train()
continued...
Week 7
Models of Language and Conversation

Training Loop 🎓


21for epoch in range(config['num_epochs']):
22    total_loss = 0
23    for batch_idx, (x, y) in enumerate(dataloader):
24        x, y = x.to(device), y.to(device)
25
26        # Forward pass
27        logits, loss = model(x, targets=y)
28
29        # Backward pass
30        optimizer.zero_grad()
31        loss.backward()
32        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
33        optimizer.step()
34
35        total_loss += loss.item()
36
37    avg_loss = total_loss / len(dataloader)
38    print(f"Epoch {epoch+1}/{config['num_epochs']}, Loss: {avg_loss:.4f}")
...continued
Week 7
Models of Language and Conversation

Training Tips 💡

**Best practices for training GPT:**



1. **Gradient clipping**
    - Prevents exploding gradients
    - Clip to max norm of 1.0
2. **Learning rate schedule**
    - Warmup for first few thousand steps
    - Cosine decay afterwards
3. **AdamW optimizer**
    - Adam with decoupled weight decay
    - Better than standard Adam for transformers
4. **Batch size**
    - Larger is better (up to memory limits)
    - Use gradient accumulation if needed
5. **Mixed precision training**
    - Use float16 for speed
    - 2-3x faster on modern GPUs
Week 7
Models of Language and Conversation

Greedy Decoding 🎯


1@torch.no_grad()
2def generate_greedy(model, tokenizer, prompt, max_new_tokens=50):
3    model.eval()
4
5    # Encode prompt
6    tokens = tokenizer.encode(prompt)
7    x = torch.tensor([tokens], dtype=torch.long, device=device)
8
9    for _ in range(max_new_tokens):
10        # Get predictions (crop to max_seq_len if needed)
11        x_crop = x[:, -model.max_seq_len:]
12        logits, _ = model(x_crop)
13
14        # Focus on last token's predictions
15        logits = logits[:, -1, :]  # (batch, vocab_size)
16
17        # Get token with highest probability
18        next_token = torch.argmax(logits, dim=-1, keepdim=True)
19
20        # Append to sequence
continued...
Week 7
Models of Language and Conversation

Greedy Decoding 🎯


21        x = torch.cat([x, next_token], dim=1)
22
23        # Stop if we generate end-of-sequence token
24        if next_token.item() == tokenizer.eot_token:
25            break
26
27    # Decode and return
28    generated_text = tokenizer.decode(x[0].tolist())
29    return generated_text
30
31# Example usage
32prompt = "Once upon a time"
33generated = generate_greedy(model, tokenizer, prompt, max_new_tokens=100)
34print(generated)
...continued
Week 7
Models of Language and Conversation

Sampling Strategies 🎲

**Different ways to sample next token:**



1. **Greedy Decoding**
    - Always pick most likely token
    - ✅ Deterministic, fast
    - ❌ Repetitive, boring
2. **Temperature Sampling**
    - Scale logits by temperature $T$ (divide logits by $T$)
    - $T < 1$: More conservative (peaked distribution)
    - $T > 1$: More random (flat distribution)
3. **Top-k Sampling**
    - Sample from top $k$ most likely tokens
    - Typical: $k = 40$
4. **Nucleus (Top-p) Sampling**
    - Sample from smallest set with cumulative probability $\geq p$
    - Typical: $p = 0.9$ or $p = 0.95$
    - ✅ Best for creative generation
Week 7
Models of Language and Conversation

Top-k and Nucleus Sampling 🎰


1def sample_next_token(logits, temperature=1.0, top_k=None, top_p=None):
2    """
3    Sample next token from logits with various strategies.
4
5    Args:
6        logits: (vocab_size,) unnormalized log probabilities
7        temperature: Temperature for sampling
8        top_k: If set, only sample from top k tokens
9        top_p: If set, only sample from nucleus (top-p)
10    """
11    # Apply temperature
12    logits = logits / temperature
13
14    # Top-k filtering
15    if top_k is not None:
16        top_k = min(top_k, logits.size(-1))
17        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
18        logits[indices_to_remove] = float('-inf')
19
20    # Nucleus (top-p) filtering
continued...
Week 7
Models of Language and Conversation

Top-k and Nucleus Sampling 🎰


21    if top_p is not None:
22        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
23        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
24
25        # Remove tokens with cumulative probability above threshold
26        sorted_indices_to_remove = cumulative_probs > top_p
27        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
28        sorted_indices_to_remove[..., 0] = 0
29
30        indices_to_remove = sorted_indices[sorted_indices_to_remove]
31        logits[indices_to_remove] = float('-inf')
32
33    # Sample from distribution
34    probs = F.softmax(logits, dim=-1)
35    next_token = torch.multinomial(probs, num_samples=1)
36
37    return next_token
...continued
Week 7
Models of Language and Conversation

Complete Generation Function 🎨


1@torch.no_grad()
2def generate(model, tokenizer, prompt, max_new_tokens=100,
3             temperature=1.0, top_k=40, top_p=0.9):
4    model.eval()
5
6    tokens = tokenizer.encode(prompt)
7    x = torch.tensor([tokens], dtype=torch.long, device=device)
8
9    for _ in range(max_new_tokens):
10        x_crop = x[:, -model.max_seq_len:]
11        logits, _ = model(x_crop)
12        logits = logits[:, -1, :]  # Last token
13
14        # Sample next token
15        next_token = sample_next_token(
16            logits[0],
17            temperature=temperature,
18            top_k=top_k,
19            top_p=top_p
20        )
continued...
Week 7
Models of Language and Conversation

Complete Generation Function 🎨


21
22        x = torch.cat([x, next_token.unsqueeze(0)], dim=1)
23
24        if next_token.item() == tokenizer.eot_token:
25            break
26
27    return tokenizer.decode(x[0].tolist())
28
29# Creative generation
30text = generate(model, tokenizer, "The AI revolution",
31                temperature=0.8, top_p=0.9)
32print(text)
...continued
Week 7
Models of Language and Conversation

Model Size vs. Performance 📊


1 10M ->  100M ->  1B ->  10B ->  100B+
**Practical model sizes for different use cases:**
- **10M-100M**: Learning/experimentation, simple tasks
- **100M-1B**: Specialized domains, resource-constrained
- **1B-10B**: General-purpose, good quality
- **10B-100B+**: State-of-the-art performance
Week 7
Models of Language and Conversation

Computational Requirements 💻

**Training a 125M parameter GPT:**




| Resource | Requirement |
| --- | --- |
| Training Time | 1-2 days (single GPU) |
| Training Data | $\sim$10-100 GB text |
| Total Compute | $\sim$100 GPU-hours |

**Scaling up to GPT-3 (175B):**
- 1000x more parameters
- ~10,000x more compute needed
- Requires distributed training across many GPUs
- Estimated $4.6M in compute costs

Think about it!

This is why pre-trained models are so valuable—you don't have to train from scratch!

Week 7
Models of Language and Conversation

Common Issues and Debugging 🔧

**Problems you might encounter:**



1. **Loss not decreasing**
    - Check learning rate (try 1e-4 to 3e-4)
    - Verify data pipeline
    - Check for NaN/Inf values
2. **Out of memory**
    - Reduce batch size
    - Reduce sequence length
    - Use gradient accumulation
    - Enable mixed precision
3. **Poor generation quality**
    - Train longer
    - Use larger model
    - Improve data quality
    - Tune sampling parameters
4. **Repetitive text**
    - Increase temperature
    - Use nucleus sampling
    - Add repetition penalty
Week 7
Models of Language and Conversation

Extensions and Improvements 🚀

**Ways to enhance your GPT implementation:**



1. **Architectural improvements**
    - Rotary Position Embeddings (RoPE)
    - Flash Attention (faster attention)
    - Grouped-Query Attention
2. **Training techniques**
    - Learning rate warmup and decay
    - Gradient accumulation
    - Mixed precision training (FP16/BF16)
3. **Data improvements**
    - Data deduplication
    - Quality filtering
    - Curriculum learning
4. **Inference optimizations**
    - KV-cache for faster generation
    - Model quantization (int8)
    - Speculative decoding
Week 7
Models of Language and Conversation

Resources for Further Learning 📚

**Recommended resources:**



- **Andrej Karpathy's nanoGPT**
Week 7
Models of Language and Conversation

Hands-On Exercise 💪

<div class="callout warning">
Your Task

Implement and train a small GPT model on your own text corpus!

**Steps:**
1. Choose a dataset (Shakespeare, Wikipedia, your own text)
2. Set up the data pipeline
3. Initialize the model (start small: 6 layers, 384 d_model)
4. Train for 10-20 epochs
5. Experiment with generation
6. Try different sampling strategies

**Starter code available:**
- Course GitHub repository
- Google Colab notebook

**Estimated time:** 2-3 hours
Week 7
Models of Language and Conversation

Key Takeaways 🔑

1. **GPT is conceptually simple**
    - Stack of transformer decoder blocks
    - Predict next token
2. **Key components**
    - BPE tokenization
    - Token + position embeddings
    - Masked multi-head attention
    - Feed-forward networks
3. **Training requires care**
    - Good data, proper hyperparameters
    - Gradient clipping, learning rate schedules
4. **Generation is an art**
    - Balance creativity and coherence
    - Temperature, top-k, top-p sampling
5. **Implementation teaches you how LLMs work**
    - Understanding through building!
Week 7
Models of Language and Conversation

Readings 📖

<div class="callout info">
Required Readings

1. Vaswani et al. (2017) - "Attention is All You Need" \
   [ArXiv]
2. Radford et al. (2018) - "Improving Language Understanding by Generative Pre-Training" \
   [PDF]
</div>

<div class="callout info">
Code Resources
</div>
Week 7
Models of Language and Conversation

Next Week 🔮

**Week 8: No Classes - Instructor Away**



**Use this week to:**
- Complete the GPT implementation exercise
- Catch up on readings
- Work on assignments
- Experiment with different architectures

**Week 9: RAG & Mixture of Experts**
- Retrieval Augmented Generation
- Mixture of Experts architectures
- Ethics, Bias, and Safety in LLMs

Week 7
Models of Language and Conversation
Questions? 💬

Happy Coding! 🚀
Week 7

TODO: Add manual table of contents or navigation