Models of Language and Conversation

Lecture 21: GPT Architecture

Generative Pre-Training for Language

Week 7


Today's Journey

  1. The GPT Revolution - Pre-training + fine-tuning paradigm
  2. Transformer Decoder - Architecture walkthrough
  3. Masked Self-Attention - The core innovation
  4. Two-Stage Training - Pre-training and fine-tuning
  5. Hands-on Code - Building GPT blocks in PyTorch
  6. Key Results - What made GPT successful

The GPT Revolution

Discussion

What made GPT different from previous language models?

Key Innovation: Pre-training + Fine-tuning

  • Pre-train on massive unlabeled text (unsupervised)
  • Fine-tune on specific tasks (supervised)
  • Transfer learning for NLP
  • One model, many tasks
Paradigm Shift

Before GPT: Task-specific models trained from scratch

After GPT: General-purpose models adapted to tasks

Reference: Radford et al. (2018) - "Improving Language Understanding by Generative Pre-Training"


From BERT to GPT: Different Approaches

BERT (2018):

  • Masked Language Model
  • Bidirectional context
  • Fill in the blank
  • Great for understanding
  • Not designed for generation

Example:
"The [MASK] sat on the mat"

GPT (2018):

  • Autoregressive Model
  • Left-to-right (causal)
  • Predict next token
  • Natural for generation
  • Can also understand

Example:
"The cat sat" -> predict "on"

Think about it!

Why is autoregressive modeling more natural for text generation?


The Transformer Decoder

GPT uses the Transformer decoder architecture

Data Flow:

  1. Input tokens
  2. Token + Position embeddings
  3. N x Transformer blocks:
    • Masked self-attention
    • Feed-forward network
  4. Output logits

Key Components:

  • Embeddings: Map tokens to vectors
  • Masked Attention: Only see past tokens
  • Feed-Forward: Process each position
  • Layer Norm: Stabilize training
Key Insight

Masked attention prevents "peeking" at future tokens - essential for autoregressive generation!


Transformer Decoder: Worked Example

Example: Processing "The cat sat"


Step 1: Token Embeddings
"The" -> [0.2, -0.1, 0.8, ...]  (768-dim vector)
"cat" -> [0.5, 0.3, -0.2, ...]
"sat" -> [-0.1, 0.7, 0.4, ...]

Step 2: Add Position Embeddings
Position 0 embedding + "The" embedding
Position 1 embedding + "cat" embedding
Position 2 embedding + "sat" embedding

Step 3: Masked Self-Attention (for each layer)
"The" attends to: [The]
"cat" attends to: [The, cat]
"sat" attends to: [The, cat, sat]

Step 4: Predict Next Token
Output logits -> softmax -> "on" (highest probability)

Masked Self-Attention

The Core Innovation: Causal Masking

Attention Matrix for "The cat sat on":

        The    cat    sat    on
The     1.0    -inf   -inf   -inf
cat     0.3    0.7    -inf   -inf
sat     0.2    0.3    0.5    -inf
on      0.1    0.2    0.3    0.4

Masked positions are set to -inf before the softmax, so their attention weight becomes 0; the remaining entries are the post-softmax attention weights (each row sums to 1).

Each token can only attend to itself and previous tokens

  • Ensures autoregressive property
  • No information leakage from future
  • Allows parallel training

Causal Mask: Code Implementation


import torch

def create_causal_mask(seq_len):
    """Create lower-triangular mask for causal attention."""
    # Create a matrix of ones
    mask = torch.ones(seq_len, seq_len)
    # Keep only lower triangle (including diagonal)
    mask = torch.tril(mask)
    return mask

# Example for sequence length 4
mask = create_causal_mask(4)
print(mask)
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])

# In attention: scores.masked_fill(mask == 0, float('-inf'))

This mask is applied before softmax to prevent attending to future tokens!
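To make "applied before softmax" concrete, here is a minimal sketch (illustrative shapes, not the full GPT attention code) of masking raw attention scores:

import torch
import torch.nn.functional as F

seq_len = 4
scores = torch.randn(seq_len, seq_len)                  # raw query-key scores (illustrative)
mask = create_causal_mask(seq_len)                      # lower-triangular mask from above
scores = scores.masked_fill(mask == 0, float('-inf'))   # block future positions
weights = F.softmax(scores, dim=-1)                     # masked positions become exactly 0
# Row i of `weights` now distributes attention only over positions 0..i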


Position Encoding in GPT

Why we need position information:

The Problem

Self-attention is permutation invariant - it doesn't inherently know word order!

GPT's Solution: Learned Position Embeddings

  • Each position gets a learnable embedding
  • Added to token embeddings: embedding = token_emb + pos_emb
  • Model learns position patterns during training
  • Maximum sequence length: 512 tokens (GPT-1)

Alternative: Sinusoidal
(Used in original Transformer)

  • Fixed function
  • Generalizes to longer sequences

GPT: Learned
(More flexible)

  • Learned from data
  • Task-specific patterns
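For contrast with GPT's learned embeddings, a minimal sketch of the fixed sinusoidal encoding from the original Transformer (standard formulation, assuming an even d_model; not GPT-1 code):

import math
import torch

def sinusoidal_positions(max_len, d_model):
    # Fixed (non-learned) encodings: sine on even dimensions, cosine on odd ones
    positions = torch.arange(max_len).unsqueeze(1).float()        # (max_len, 1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(positions * div_term)
    pe[:, 1::2] = torch.cos(positions * div_term)
    return pe                                                     # added to token embeddings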

Position Embedding: Code Example


import torch
import torch.nn as nn

class GPTEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model, max_seq_len):
        super().__init__()
        # Token embeddings: word -> vector
        self.token_embed = nn.Embedding(vocab_size, d_model)
        # Position embeddings: position -> vector
        self.pos_embed = nn.Embedding(max_seq_len, d_model)

    def forward(self, x):
        seq_len = x.size(1)
        # Create position indices [0, 1, 2, ..., seq_len-1]
        positions = torch.arange(seq_len, device=x.device)
        # Combine: token embedding + position embedding
        return self.token_embed(x) + self.pos_embed(positions)

# Example usage
embed = GPTEmbedding(vocab_size=50000, d_model=768, max_seq_len=512)
tokens = torch.tensor([[101, 2054, 2003]])  # "The cat sat" (illustrative token IDs)
embeddings = embed(tokens)  # Shape: (1, 3, 768)

GPT-1 Model Specifications

Component | Value
Layers | 12
Hidden size | 768
Attention heads | 12
Max sequence length | 512 tokens
Vocabulary size | 40,000 (BPE)
Training data | BooksCorpus (7,000 books)
Training tokens | ~5 billion
Total parameters | ~117 million

Training objective:

Maximize likelihood of next token given previous context.
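In the notation of Radford et al. (2018), this objective maximizes the log-likelihood of each token given a context window of k preceding tokens:

L_1(\mathcal{U}) = \sum_i \log P(u_i \mid u_{i-k}, \ldots, u_{i-1}; \Theta)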


GPT Architecture in Code


import torch.nn as nn

class GPTBlock(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.attention = MaskedMultiHeadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.GELU(),  # GPT uses GELU, not ReLU
            nn.Linear(4 * d_model, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Pre-norm architecture (LayerNorm before sublayer)
        # Self-attention with residual connection
        x = x + self.attention(self.norm1(x))
        # Feed-forward with residual connection
        x = x + self.ffn(self.norm2(x))
        return x
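The GPTBlock above references a MaskedMultiHeadAttention module that is not defined on the slide. Here is a minimal sketch of what it could look like, combining the causal mask from earlier with standard multi-head attention (illustrative, not the exact GPT-1 implementation):

import torch
import torch.nn as nn
import torch.nn.functional as F

class MaskedMultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        assert d_model % n_heads == 0
        self.n_heads = n_heads
        self.d_head = d_model // n_heads
        self.qkv = nn.Linear(d_model, 3 * d_model)   # joint projection to queries, keys, values
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x):
        batch, seq_len, d_model = x.shape
        q, k, v = self.qkv(x).chunk(3, dim=-1)
        # Split into heads: (batch, n_heads, seq_len, d_head)
        q = q.view(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        k = k.view(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        v = v.view(batch, seq_len, self.n_heads, self.d_head).transpose(1, 2)
        # Scaled dot-product scores with the causal mask applied before softmax
        scores = q @ k.transpose(-2, -1) / (self.d_head ** 0.5)
        mask = torch.tril(torch.ones(seq_len, seq_len, device=x.device))
        scores = scores.masked_fill(mask == 0, float('-inf'))
        weights = F.softmax(scores, dim=-1)
        out = weights @ v                            # (batch, n_heads, seq_len, d_head)
        out = out.transpose(1, 2).reshape(batch, seq_len, d_model)
        return self.out(out)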

Complete GPT Model Structure


class GPT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, max_len):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = nn.Embedding(max_len, d_model)
        self.blocks = nn.ModuleList([
            GPTBlock(d_model, n_heads) for _ in range(n_layers)
        ])
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device)
        x = self.token_embed(x) + self.pos_embed(positions)
        for block in self.blocks:
            x = block(x)
        x = self.norm(x)
        logits = self.head(x)  # (batch, seq_len, vocab_size)
        return logits
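Hypothetical usage of the model above for greedy next-token generation (token IDs are illustrative, not from a real tokenizer):

import torch

model = GPT(vocab_size=50000, d_model=768, n_layers=12, n_heads=12, max_len=512)
tokens = torch.tensor([[101, 2054, 2003]])                      # e.g. "The cat sat"
for _ in range(5):
    logits = model(tokens)                                      # (1, seq_len, vocab_size)
    next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # most probable next token
    tokens = torch.cat([tokens, next_token], dim=1)             # append and predict again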

Two-Stage Training

Stage 1: Pre-training

  • Dataset: BooksCorpus (5B tokens)
  • Objective: Next token prediction
  • Duration: Weeks of training
  • Result: General language model

Stage 2: Fine-tuning

  • Dataset: Task-specific (1K-100K examples)
  • Objective: Task loss + LM loss (written out after this list)
  • Duration: Hours of training
  • Result: Task-specialized model

Why this works:

  • Pre-training learns general language understanding
  • Fine-tuning adapts to specific task format
  • Much less task data needed!
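The "Task loss + LM loss" objective above is, in the paper's notation, the supervised loss L_2 plus the pre-training language-modeling loss L_1 kept as an auxiliary term (GPT-1 uses λ = 0.5):

L_3(\mathcal{C}) = L_2(\mathcal{C}) + \lambda \, L_1(\mathcal{C})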

Pre-training: Language Modeling

Objective: Predict next token

Example Training Sequence

Text: "The quick brown fox jumps over the lazy dog"

Training examples (teacher forcing):

  • Input: "The" -> Target: "quick"
  • Input: "The quick" -> Target: "brown"
  • Input: "The quick brown" -> Target: "fox"
  • Input: "The quick brown fox" -> Target: "jumps"
  • ...and so on

Self-supervised learning:

  • No manual labels needed!
  • Every text provides training signal
  • Can use web-scale data
  • Model learns: syntax, semantics, facts, reasoning

Pre-training: Worked Example

How loss is computed for one sequence:


# Input sequence: "The cat sat on the"
tokens = [101, 2054, 2003, 2006, 1996]  # Token IDs

# Model predicts probability distribution for each position
# Position 0: P(next | "The") = {"cat": 0.3, "dog": 0.2, ...}
# Position 1: P(next | "The cat") = {"sat": 0.4, "ran": 0.1, ...}
# etc.

# Target tokens (shifted by 1)
targets = [2054, 2003, 2006, 1996, 2282]  # "cat sat on the mat"

# Cross-entropy loss at each position
loss_0 = -log(0.3)   # P("cat" | "The")
loss_1 = -log(0.4)   # P("sat" | "The cat")
loss_2 = -log(0.35)  # P("on" | "The cat sat")
# ...

total_loss = mean(loss_0, loss_1, loss_2, ...)
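A hedged sketch of the same computation using PyTorch's cross_entropy, assuming `model` is the GPT module defined earlier (token IDs illustrative):

import torch
import torch.nn.functional as F

tokens = torch.tensor([[101, 2054, 2003, 2006, 1996, 2282]])   # "The cat sat on the mat"
inputs, targets = tokens[:, :-1], tokens[:, 1:]                # predict token t+1 from tokens 0..t
logits = model(inputs)                                         # (1, 5, vocab_size)
loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))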

Fine-tuning for Downstream Tasks

Task Format: Input + Delimiter + Label

Text Classification:


[START] This movie is amazing! [DELIM]

Model predicts: "Positive"

Entailment:


[START] Premise [DELIM] Hypothesis [DELIM]

Model predicts: "Entailment" / "Contradiction"

Question Answering:


[START] Context [DELIM] Question [DELIM]

Model predicts answer span

Similarity:


[START] Text1 [DELIM] Text2 [DELIM]

Model predicts similarity score

Key Insight

All tasks can be framed as text completion!
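A hypothetical helper (delimiters and field names illustrative) showing how these input transformations reduce to plain string formatting:

def format_for_gpt(task, **fields):
    # Hypothetical input transformations in the style sketched above
    if task == "classification":
        return f"[START] {fields['text']} [DELIM]"
    if task == "entailment":
        return f"[START] {fields['premise']} [DELIM] {fields['hypothesis']} [DELIM]"
    if task == "qa":
        return f"[START] {fields['context']} [DELIM] {fields['question']} [DELIM]"
    if task == "similarity":
        return f"[START] {fields['text1']} [DELIM] {fields['text2']} [DELIM]"
    raise ValueError(f"unknown task: {task}")

# e.g. format_for_gpt("classification", text="This movie is amazing!")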


Fine-tuning: Concrete Example

Sentiment Classification Example:


# Training example
text = "This movie was absolutely fantastic!"
label = "positive"

# Format for GPT
input_text = "[START] This movie was absolutely fantastic! [DELIM]"
# Tokenize: [50256, 1212, 3807, 373, 5765, 12779, 0, 50257]

# During fine-tuning:
# 1. GPT processes the input
# 2. Take the hidden state at [DELIM] position
# 3. Pass through classification head
logits = classification_head(hidden_state)  # [pos_score, neg_score]
# 4. Compute cross-entropy with label
loss = cross_entropy(logits, label_id)

# Fine-tuning hyperparameters
learning_rate = 6.25e-5  # Much lower than pre-training!
batch_size = 32
epochs = 3

Fine-tuning Implementation Details

What changes during fine-tuning?

  1. Add task-specific input transformations

    • Format inputs with delimiters
    • Add special tokens
  2. Add classification head (optional)

    • Linear layer for classification
    • Or use language modeling head
  3. Train with supervised objective

    • Cross-entropy loss
    • Much lower learning rate
    • Few epochs (3-5)

Hyperparameters:

  • Learning rate: 6.25e-5 (much lower than pre-training!)
  • Batch size: 32
  • Max epochs: 3
  • Linear learning rate decay to 0
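Putting the pieces together, a hedged sketch of one fine-tuning step with these hyperparameters; `gpt` (a backbone returning hidden states), `lm_head`, and `classification_head` are hypothetical stand-ins, not the paper's code:

import torch
import torch.nn.functional as F

optimizer = torch.optim.Adam(
    list(gpt.parameters()) + list(classification_head.parameters()),
    lr=6.25e-5,                                   # fine-tuning learning rate
)

def fine_tune_step(tokens, label_ids, lam=0.5):   # lam weights the auxiliary LM loss
    hidden = gpt(tokens)                          # (batch, seq_len, d_model)
    # Task loss: classify from the hidden state at the final ([DELIM]) position
    task_logits = classification_head(hidden[:, -1, :])
    task_loss = F.cross_entropy(task_logits, label_ids)
    # Auxiliary language-modeling loss on the same input
    lm_logits = lm_head(hidden)                   # (batch, seq_len, vocab_size)
    lm_loss = F.cross_entropy(
        lm_logits[:, :-1, :].reshape(-1, lm_logits.size(-1)),
        tokens[:, 1:].reshape(-1),
    )
    loss = task_loss + lam * lm_loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()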

GPT-1 Results

Performance on GLUE Benchmark:

Task | Previous SOTA | GPT-1
Question Answering | 86.7 | 88.1
Semantic Similarity | 85.0 | 85.8
Text Classification | 93.0 | 94.2
Natural Language Inference | 80.6 | 82.1
Key Findings
  • GPT achieved SOTA on 9 out of 12 tasks studied
  • Large improvements on tasks with less training data
  • Transfer learning works for NLP!

Reference: Radford et al. (2018) - "Improving Language Understanding by Generative Pre-Training"


Zero-Shot and Few-Shot Learning

Discussion

What if we could use the model without fine-tuning?

Definitions:

  • Zero-shot: No task-specific training examples
  • Few-shot: A few examples (1-10) as context
  • Fine-tuning: Full supervised training

GPT-1 observations:

  • Fine-tuning works best
  • Zero-shot performance is weak
  • Model has learned a lot, but needs task formatting
Think about it!

This limitation motivated GPT-2 and GPT-3: Can we make models that work zero-shot?


Zero-Shot vs Fine-tuning: Example

Task: Sentiment Classification

Zero-shot (GPT-1):


1Input: "Review: Great movie!
2Sentiment:"
3Output: ??? (unreliable)

GPT-1 might simply continue the review text rather than produce a sentiment label.

After Fine-tuning:


1Input: "[START] Great movie! [DELIM]"
2Output: "Positive" (reliable)

Model learned task format from examples.

Why the difference?

  • Pre-training teaches language, not task formats
  • Fine-tuning teaches: "when you see [DELIM], output a label"
  • GPT-2/3 would see enough examples in training to do this zero-shot

Visualizing What GPT Learns

Layer-by-layer analysis of GPT representations:

Layer | What it learns | Example
1-3 | Word order, grammar | "The cat" vs "cat The"
4-6 | Word meanings, syntax | Subject-verb agreement
7-9 | Semantic relationships | "bank" in different contexts
10-12 | World knowledge, task-specific reasoning | "Paris is the capital of..."

Layer analysis shows:

  • Early layers: Syntactic patterns
  • Middle layers: Semantic understanding
  • Later layers: Task-specific reasoning
  • Progressive abstraction!

The Generative Pre-training Paradigm

Why "Generative Pre-Training" was revolutionary:

  1. Unified Architecture

    • Same model for all tasks
    • vs. task-specific architectures
  2. Transfer Learning

    • Learn once, apply many times
    • vs. training from scratch
  3. Scalability

    • More data -> Better performance
    • Clear path to improvement
  4. Simplicity

    • Just predict next token
    • No complex objectives
This paradigm enabled GPT-2, GPT-3, and beyond!

Limitations of GPT-1

What GPT-1 couldn't do well:

  • Zero-shot performance: Needed fine-tuning for each task
  • Model size: 117M parameters (small by today's standards)
  • Training data: Limited to ~5B tokens
  • Context length: Only 512 tokens
  • Generation quality: Sometimes incoherent
  • Factual accuracy: Prone to hallucinations
Discussion

How would you address these limitations?

(Spoiler: GPT-2 and GPT-3 tried to solve these!)


Comparison with BERT

Aspect | BERT | GPT
Training objective | Masked LM | Autoregressive LM
Context | Bidirectional | Left-to-right
Best for | Understanding | Generation
Fine-tuning | Task-specific heads | Unified format
Parameters (base) | 110M | 117M
Think about it!

BERT dominated understanding tasks (2018-2019), but GPT's approach proved more scalable and versatile. Why?

Answer: Autoregressive modeling naturally supports generation, which unlocks zero-shot and few-shot capabilities!


Key Takeaways

  1. GPT introduced generative pre-training

    • Transformer decoder architecture
    • Autoregressive language modeling
  2. Two-stage training paradigm

    • Unsupervised pre-training on massive text
    • Supervised fine-tuning on task data
  3. Masked self-attention is key

    • Enables autoregressive generation
    • Prevents information leakage
  4. Transfer learning for NLP

    • Learn general patterns once
    • Apply to many tasks
  5. Foundation for modern LLMs

    • GPT-2, GPT-3, ChatGPT all build on this

Readings

Required Reading

Radford et al. (2018) - "Improving Language Understanding by Generative Pre-Training"

[PDF]

Recommended Readings
  • Vaswani et al. (2017) - "Attention is All You Need" [ArXiv]
  • Devlin et al. (2018) - "BERT: Pre-training of Deep Bidirectional Transformers" [ArXiv]
  • The Illustrated GPT-2 by Jay Alammar [Blog]

Next Lecture Preview

Lecture 22: Scaling Up to GPT-3 and Beyond
  • GPT-2: Language models as unsupervised multitask learners
  • GPT-3: Few-shot learning at scale
  • Scaling laws: Why bigger is better
  • Emergent abilities of large language models
  • From GPT-3 to ChatGPT

Questions?
