* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
PSYC 51.07: Models of Language and Communication

Lecture 16: Transformer Architecture

Week 5, Lecture 2 - Attention Is All You Need

PSYC 51.07: Models of Language and Communication

Winter 2026

Winter 2026
PSYC 51.07: Models of Language and Communication

Today's Agenda 📋

  1. 🤖 The Transformer Revolution: Why it changed everything
  2. 🏗️ Architecture Overview: Encoder, Decoder, and components
  3. 🎯 Self-Attention: The core mechanism (Q, K, V)
  4. 📝 Self-Attention Example: Understanding pronoun resolution
  5. 🎭 Multi-Head Attention: Learning diverse relationships
  6. 🔍 Three Types of Attention: Self, Masked, Cross

Goal: Understand the Transformer architecture and self-attention

Winter 2026
PSYC 51.07: Models of Language and Communication

The Transformer Revolution 🤖

"Attention Is All You Need"

Before Transformers (2017):

  • RNNs/LSTMs with attention
  • Sequential processing
  • Hard to parallelize
  • Limited context window
  • Slow training

After Transformers:

  • Parallel processing
  • Scales to GPUs/TPUs
  • Long-range dependencies
  • Fast & effective
Speed Comparison

Processing "The cat sat on the mat" (6 tokens):

  • RNN: 6 sequential steps (must wait for each)
  • Transformer: 1 parallel step (all tokens at once!)

Training speedup: 10-100x faster on modern hardware

Reference: Vaswani et al. (2017) - "Attention Is All You Need"

Winter 2026
PSYC 51.07: Models of Language and Communication

Why Get Rid of RNNs? 🚫

Limitations of Recurrent Architectures:

1. Sequential Bottleneck

  • Must process token t before t+1
  • Cannot parallelize across sequence

2. Long-Range Dependencies

  • Info flows through many steps
  • Gradient vanishing problems

3. Memory Constraints

  • Hidden state must remember all

Concrete Example:


1Sentence: "The cat that I saw
2yesterday at the park sat down"
3
4Token 1 ("The") to token 11 ("sat"):
5- RNN: Info passes through 10 steps
6- Gradients shrink: 0.9^10 = 0.35
7- By token 11, "The" is almost gone!
8
9Transformer: Direct connection!
10- "sat" attends directly to "cat"
11- No information degradation

Transformer Solution: Every token can attend to every other token directly!

Winter 2026
PSYC 51.07: Models of Language and Communication

Transformer Architecture Overview 🏗️


Input -> Encoder [Multi-Head Self-Attention -> Feed Forward] -> Decoder [Masked Multi-Head Attention -> Cross-Attention -> Feed Forward] -> Output

Key Components:

  • Multi-Head Self-Attention: Relate all positions to each other
  • Feed-Forward Networks: Transform representations
  • Residual Connections & Layer Norm: Training stability (not shown)
  • Positional Encoding: Inject position information
Winter 2026
PSYC 51.07: Models of Language and Communication

Self-Attention: The Core Mechanism 🎯

Key idea: Each word attends to all other words in the sequence

Three learned projections:

  • Query (Q): What am I looking for?
  • Key (K): What do I contain?
  • Value (V): What information do I have?

Computation:


1Q = X @ W_Q    # Transform input to queries
2K = X @ W_K    # Transform input to keys
3V = X @ W_V    # Transform input to values
4
5Attention(Q, K, V) = softmax(Q @ K.T / sqrt(d)) @ V
Intuition

Each token asks: "Which other tokens are relevant to me?" (Q vs K)
Then collects information from relevant tokens (weighted sum of V)

Winter 2026
PSYC 51.07: Models of Language and Communication

Understanding Query, Key, Value 🔑

Analogy: Database lookup or information retrieval

Information Retrieval:

  • Query: Your search query
  • Key: Document titles/keywords
  • Value: Document contents

Process:

  1. Compare query to all keys
  2. Get similarity scores
  3. Weight values by scores
  4. Return weighted combination

Self-Attention:

  • Query: What token is looking for
  • Key: What token offers
  • Value: Information from token

Process:

  1. Compare to all
  2. Get attention scores
  3. Weight all by scores
  4. Return new representation for the token
Key Insight

Each token simultaneously acts as:

  • A query (what it needs from other tokens)
  • A key (how it should be retrieved)
  • A value (what information it provides)
Winter 2026
PSYC 51.07: Models of Language and Communication

Scaled Dot-Product Attention 📐

Step-by-step computation with concrete example:

Input: 3 tokens, embedding dim = 4


1# Input embeddings (3 tokens x 4 dims)
2X = [[0.1, 0.2, 0.3, 0.4],   # "The"
3     [0.5, 0.6, 0.7, 0.8],   # "cat"
4     [0.2, 0.3, 0.4, 0.5]]   # "sat"
5
6# Step 1: Compute Q, K, V (using learned weights W_q, W_k, W_v)
7Q = X @ W_q  # [3 x 4]
8K = X @ W_k  # [3 x 4]
9V = X @ W_v  # [3 x 4]
10
11# Step 2: Compute attention scores
12scores = Q @ K.T  # [3 x 3] - each token vs each token
13
14# Step 3: Scale by sqrt(d_k) to prevent large values
15scores = scores / sqrt(4)  # divide by 2
16
17# Step 4: Softmax to get attention weights
18weights = softmax(scores)  # rows sum to 1
19
20# Step 5: Weighted sum of values
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication

Scaled Dot-Product Attention 📐


21output = weights @ V  # [3 x 4] - new contextual embeddings
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication

Self-Attention Example: Pronoun Resolution 📝

Sentence: "The animal didn't cross the street because it was too tired"

Question: What does "it" refer to?


1Attention weights when processing "it":
2
3           The  animal  didn't  cross  the  street  because  it   was  too  tired
4"it" →    0.02  [0.45]   0.03   0.05  0.02  0.08    0.05   0.15  0.05 0.02  0.08
5
6          High attention to "animal" - model learns coreference!

Self-attention allows the model to:

  • Resolve pronouns ("it" → "animal", not "street")
  • Understand long-range dependencies (8 tokens apart!)
  • Capture syntactic and semantic relationships
  • Do this in parallel for all positions!
Winter 2026
PSYC 51.07: Models of Language and Communication

Visualizing the Attention Matrix 🔍

For sentence: "The cat sat on the mat"

To The cat sat on the mat
From
The 0.7 0.15 0.05 0.05 0.025 0.025
cat 0.1 0.5 0.2 0.1 0.05 0.05
sat 0.05 0.3 0.4 0.15 0.05 0.05
on 0.05 0.1 0.2 0.3 0.1 0.25
the 0.05 0.05 0.05 0.1 0.3 0.45
mat 0.05 0.05 0.1 0.2 0.2 0.4

Observations:

  • Each row sums to 1.0 (probability distribution)
  • Diagonal elements often high (self-attention)
  • "cat" attends to itself and "The" (determiner-noun relationship)
  • "mat" attends to "the" (determiner) and "on" (preposition)
  • Captures syntactic and semantic structure automatically!
Winter 2026
PSYC 51.07: Models of Language and Communication

Multi-Head Attention 🎭

Why use multiple attention heads?

Intuition:

  • Different heads learn different relationships
  • Head 1: Syntactic relationships
  • Head 2: Semantic relationships
  • Head 3: Positional patterns

Formula:


1# Each head has its own W_Q, W_K, W_V
2head_1 = Attention(Q @ W1_Q, K @ W1_K, V @ W1_V)
3head_2 = Attention(Q @ W2_Q, K @ W2_K, V @ W2_V)
4# ... more heads ...
5
6# Concatenate and project
7output = concat(head_1, head_2, ...) @ W_O

Concrete Example:


1Sentence: "The cat sat on the mat"
2
3Head 1 (syntax):
4  "sat" → "cat" (subject-verb)
5  "mat" → "the" (determiner)
6
7Head 2 (semantics):
8  "sat" → "mat" (action-location)
9  "cat" → "sat" (agent-action)
10
11Head 3 (position):
12  Each word → neighbors

BERT-base: 12 heads, BERT-large: 16 heads, GPT-3: 96 heads!

Winter 2026
PSYC 51.07: Models of Language and Communication

Why Multiple Heads? 🤔

Example: Different heads learn different patterns

Sentence: "The cat sat on the mat"

Head 1: Syntactic Dependencies

  • "cat" → "The" (noun-determiner)
  • "sat" → "cat" (verb-subject)
  • "mat" → "the" (noun-determiner)
  • Learns grammar structure

Head 2: Semantic Relations

  • "sat" → "mat" (action-location)
  • "cat" → "mat" (agent-location)
  • Learns meaning relationships

Head 3: Local Context

  • Each word → neighbors
  • Short-range dependencies
  • N-gram like patterns

Head 4: Long-Range

  • Distant word relationships
  • Document-level context
  • Coreference resolution
Ensemble Effect

Multiple heads provide a richer, more diverse representation by attending to different aspects of the input simultaneously!

Winter 2026
PSYC 51.07: Models of Language and Communication

Three Types of Attention 🔍

  1. Self-Attention (Encoder)
    • Each position attends to all positions in same sequence
  • Bidirectional: can see past and future
  • Used in: BERT, encoder-only models
  2. Masked Self-Attention (Decoder)
    • Each position attends only to previous positions
  • Prevents "looking into the future"
  • Used in: GPT, decoder-only models
  3. Cross-Attention (Encoder-Decoder)
    • Decoder attends to encoder outputs
  • Queries from decoder, Keys/Values from encoder
  • Used in: T5, BART, machine translation
Attention Type          Queries From    Keys/Values From
Self-Attention          Sequence        Same sequence
Masked Self-Attention   Sequence        Same sequence (past only)
Cross-Attention         Decoder         Encoder
Winter 2026
PSYC 51.07: Models of Language and Communication

Masked Self-Attention 🎭

Preventing the model from "cheating" during generation

Problem: During training, we have the full target sequence. Without masking, the model could "peek" at future tokens!

Solution: Mask out future positions by setting attention scores to -infinity before softmax.


1# Example: Generating "The cat sat"
2# When predicting "sat", model should only see "The cat"
3
4scores = [[0.5, 0.3, 0.2],    # "The" can see: The
5          [0.4, 0.5, 0.1],    # "cat" can see: The, cat
6          [0.2, 0.4, 0.4]]    # "sat" can see: The, cat, sat
7
8# Apply causal mask (upper triangle = -infinity)
9mask = [[ 0,  -inf, -inf],
10        [ 0,   0,   -inf],
11        [ 0,   0,    0  ]]
12
13masked_scores = scores + mask
14# After softmax: future positions get weight 0!

Result: Token at position t can only attend to positions <= t

  • Maintains autoregressive property
  • Enables parallel training while preserving causality
Winter 2026
PSYC 51.07: Models of Language and Communication

Cross-Attention 🔗

Connecting encoder and decoder in seq2seq models


1Encoder Outputs ->  (Keys & Values) -> Decoder State ->  (Queries) -> Cross-Attention -> Context-Aware Decoder

Key Properties:

  • Q comes from decoder (what decoder needs)
  • K, V come from encoder (what input provides)
  • Allows decoder to "look at" relevant parts of input
  • Similar to the original attention mechanism from Lecture 12!

Used in: Machine translation, summarization, any encoder-decoder task

Winter 2026
PSYC 51.07: Models of Language and Communication

Implementing Self-Attention in PyTorch 💻

Scaled dot-product attention


1import torch
2import torch.nn as nn
3import torch.nn.functional as F
4import math
5
6class SelfAttention(nn.Module):
7    def __init__(self, embed_dim):
8        super().__init__()
9        self.embed_dim = embed_dim
10        self.W_q = nn.Linear(embed_dim, embed_dim)
11        self.W_k = nn.Linear(embed_dim, embed_dim)
12        self.W_v = nn.Linear(embed_dim, embed_dim)
13
14    def forward(self, x, mask=None):
15        Q = self.W_q(x)  # Queries: what am I looking for?
16        K = self.W_k(x)  # Keys: what do I contain?
17        V = self.W_v(x)  # Values: what info do I provide?
18
19        # Attention scores: how similar are Q and K?
20        scores = torch.matmul(Q, K.transpose(-2, -1))
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication

Implementing Self-Attention in PyTorch 💻


21        scores = scores / math.sqrt(self.embed_dim)  # Scale!
22
23        if mask is not None:  # For causal/decoder attention
24            scores = scores.masked_fill(mask == 0, -1e9)
25
26        attn_weights = F.softmax(scores, dim=-1)  # Normalize
27        output = torch.matmul(attn_weights, V)    # Weighted sum
28
29        return output, attn_weights
30
31# Usage example:
32attn = SelfAttention(embed_dim=64)
33x = torch.randn(1, 5, 64)  # 5 tokens, 64-dim embeddings
34out, weights = attn(x)
35# out: [1, 5, 64] - contextualized embeddings
36# weights: [1, 5, 5] - attention matrix
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication

Computational Complexity ⚙️

Understanding the cost of self-attention

Component Time Complexity Memory
Self-Attention O(n^2 * d) O(n^2)
Feed-Forward O(n * d^2) O(d)

where n = sequence length, d = embedding dimension

Concrete Example: Memory Usage

Sequence length n = 1000 tokens, d = 768 (BERT-base)

Attention matrix size: n x n = 1000 x 1000 = 1 million entries
At fp32 (4 bytes): 4 MB per layer, per head

BERT-base: 12 layers x 12 heads = 144 attention matrices
Total: 576 MB just for attention weights!

If n = 10,000: 100x more = 57.6 GB (won't fit on most GPUs!)

Typical context limits:

  • BERT: 512 | GPT-2: 1024 | GPT-3: 2048 | GPT-4: 128k (with optimizations)
Winter 2026
PSYC 51.07: Models of Language and Communication

Discussion Questions 💭

  1. Self-Attention vs RNN Attention:
    • What's the key difference?
  • Why is self-attention more powerful?
  • When might RNNs still be useful?
  2. Query, Key, Value Framework:
    • Why three separate projections instead of one?
  • What if we used a single shared matrix for Q, K, and V?
  • How does this relate to information retrieval?
  3. Multi-Head Attention:
    • Why not just use one big attention head?
  • How many heads is optimal?
  • Can we interpret what each head learns?
  4. Scalability:
    • O(n^2) attention cost is problematic for long documents. Solutions?
  • Sparse attention? Local attention? Other ideas?
Winter 2026
PSYC 51.07: Models of Language and Communication

Looking Ahead 🔮

What's Next?

Today we learned:

  • Why transformers replaced RNNs
  • Self-attention mechanism (Q, K, V)
  • Multi-head attention
  • Three types of attention (self, masked, cross)

Next lecture (Lecture 17 - Training Transformers):

  • Positional Encoding: How to inject position information
  • Feed-Forward Networks: The other key component
  • Residual Connections & Layer Norm: Training stability
  • Efficient Attention: Making transformers faster
  • Architecture Variants: Encoder, Decoder, Encoder-Decoder
  • Practical Considerations: Training and using transformers

We're building up to BERT and GPT! 🚀

Winter 2026
PSYC 51.07: Models of Language and Communication

Summary 🎯

Key Takeaways:

  1. Transformer Revolution
    • Pure attention, no recurrence
  • Parallel processing, faster training
  2. Self-Attention Mechanism
    • Query, Key, Value framework
  • Each token attends to all others
  • Scaled dot-product: Attention(Q, K, V) = softmax(QK^T / sqrt(d)) V
  3. Multi-Head Attention
    • Multiple heads learn diverse relationships
  • Concatenate and project back
  • Richer representations
  4. Three Attention Types
    • Self (encoder), Masked (decoder), Cross (encoder-decoder)
  • Different uses for different architectures

Self-attention is the foundation of modern NLP!

Winter 2026
PSYC 51.07: Models of Language and Communication

References 📚

Essential Papers:

  • Vaswani et al. (2017) - "Attention Is All You Need"

  • The original Transformer paper

  • Introduced self-attention, multi-head attention

  • Foundation of modern NLP

  • Bahdanau et al. (2015) - "Neural Machine Translation by Jointly Learning to Align and Translate"

    • Original attention mechanism (for comparison)

Tutorials and Resources:

Winter 2026
PSYC 51.07: Models of Language and Communication

Questions? 🙋

Discussion Time

Topics for discussion:

  • Self-attention mechanism
  • Query, Key, Value intuition
  • Multi-head attention
  • Masked vs. unmasked attention
  • Implementation questions

Thank you! 🙏

Next: Training Transformers!

Winter 2026