PSYC 51.07: Models of Language and Communication

Lecture 15: Attention Mechanisms

Week 5, Lecture 1 - From Seq2Seq to Attention

Winter 2026

Today's Agenda 📋

  1. 🤔 The Context Problem: Why static embeddings aren't enough
  2. 🔄 Sequence-to-Sequence Models: The foundation
  3. ⚠️ The Bottleneck Problem: Why vanilla Seq2Seq struggles
  4. ⚡ Attention Mechanisms: The breakthrough innovation
  5. 🔍 How Attention Works: Step-by-step computation
  6. 👁️ Visualizing Attention: Interpreting the weights

Goal: Understand why and how attention revolutionized NLP


The Context Problem 🤔

Why do we need context-aware models?

Example: The word "bank"
  1. "I deposited money at the bank" (financial institution)
  2. "We sat by the river bank" (riverside)
  3. "The plane started to bank left" (tilt/turn)

Static Embeddings (Word2Vec):

  • One vector per word
  • Context-independent
  • "bank" = [0.2, -0.5, 0.8, ...] always

Context-Aware Models:

  • Different representation per occurrence
  • Uses surrounding context
  • Handles polysemy naturally

Sequence-to-Sequence Models 🔄

The breakthrough for variable-length input/output problems

Applications:

  • Machine Translation: English → French
  • Summarization: Long text → Short summary
  • Question Answering: Question + Context → Answer
  • Dialogue Systems: User input → System response
Concrete Example: Machine Translation

Input: "The cat sat on the mat" (6 tokens)
Output: "Le chat s'est assis sur le tapis" (7 tokens)

Different input/output lengths require flexible architecture!

Reference: Sutskever et al. (2014) - "Sequence to Sequence Learning with Neural Networks"


Encoder-Decoder Architecture 🏗️

Two-part architecture: Encode then Decode

Encoder:

  • Reads input sequence
  • Compresses to fixed-size vector
  • Captures semantic meaning

Decoder:

  • Starts from context vector
  • Generates output sequence
  • One token at a time

Worked Example:


Input:  "The" → "cat" → "sat"
          ↓       ↓       ↓
        h₁   →   h₂  →   h₃ = context

Output: [START] → "Le" → "chat" → ...

The final hidden state h₃ summarizes the entire input!
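
To make this concrete, here is a minimal PyTorch sketch of the encoder-decoder setup (an illustrative assumption, not the course's reference code: the GRU sizes and token ids are made up). The decoder's only view of the source sentence is the encoder's final hidden state.

import torch
import torch.nn as nn

emb_dim, hidden_dim, vocab_size = 32, 64, 1000
embedding = nn.Embedding(vocab_size, emb_dim)
encoder = nn.GRU(emb_dim, hidden_dim, batch_first=True)
decoder = nn.GRU(emb_dim, hidden_dim, batch_first=True)

src = torch.tensor([[4, 17, 92]])        # token ids for "The cat sat" (made-up ids)
enc_out, h_n = encoder(embedding(src))   # enc_out: [1, 3, 64], h_n: [1, 1, 64]

context = h_n                            # h₃: the single fixed-size context vector
start = torch.tensor([[1]])              # hypothetical [START] token id
dec_out, _ = decoder(embedding(start), context)  # decoder is initialized with h₃
print(enc_out.shape, context.shape, dec_out.shape)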


The Seq2Seq Bottleneck Problem ⚠️

Challenge: All information compressed into single vector!

The Problem
  • Long sequences → information loss
  • Fixed-size context vector is a bottleneck
  • Early tokens forgotten by the time we reach the end
  • Performance degrades with sequence length
Concrete Example: Long Sentence Translation

Input (19 words): "The quick brown fox jumps over the lazy dog while the cat watches from the warm sunny windowsill nearby"

Problem: All 19 words must fit into one 256-dim vector!

  • Early words ("The quick brown") get overwritten
  • By the time we translate, we've "forgotten" the beginning

Solution: Attention! ⚡


Attention Mechanism: The Big Idea ⚡

Instead of compressing everything into one vector...

Let the decoder look at all encoder hidden states!


Input:    "The  cat  sat  down"
Encoder:   h₁ → h₂ → h₃ → h₄
Decoder:   at each step, the decoder state s_t can attend to all of h₁ … h₄

Key Insight: When generating "chat" (cat), pay more attention to "cat" in the input!

Reference: Bahdanau et al. (2015) - "Neural Machine Translation by Jointly Learning to Align and Translate"


How Attention Works: Step by Step 🔍

Computing attention weights:

  1. Score: How relevant is each encoder state to current decoder state?

    • e_{t,i} = score(s_t, h_i) = s_t^T W_a h_i
  2. Normalize: Convert scores to probabilities (softmax)

    • alpha_{t,i} = exp(e_{t,i}) / sum_j exp(e_{t,j})
  3. Context: Weighted sum of encoder states

    • c_t = sum_i alpha_{t,i} * h_i
  4. Decode: Use context vector along with decoder state

    • s_{t+1} = f(s_t, c_t, y_t)

Result: Decoder dynamically focuses on different parts of input!
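
The first three steps can be written in a few lines of PyTorch. This is a minimal sketch (not the course's reference implementation): H holds the encoder states, s_t is the current decoder state, and W_a is the learned score matrix (random here, just to show the shapes); step 4, feeding c_t back into the decoder, is omitted.

import torch
import torch.nn.functional as F

def attention_step(s_t, H, W_a):
    # 1. Score: e_i = s_tᵀ W_a h_i for every encoder state h_i
    e = H @ (W_a.T @ s_t)        # [seq_len]
    # 2. Normalize: weights form a probability distribution
    alpha = F.softmax(e, dim=0)  # [seq_len], sums to 1
    # 3. Context: weighted sum of encoder states
    c_t = alpha @ H              # [hidden]
    return c_t, alpha

H = torch.randn(5, 8)    # 5 encoder hidden states, 8-dim
s_t = torch.randn(8)     # current decoder state
W_a = torch.randn(8, 8)  # score matrix (random placeholder)
c_t, alpha = attention_step(s_t, H, W_a)
print(alpha.sum())       # ≈ 1.0 — a probability distribution over input positions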


Worked Example: Attention Computation 📊

Translating "I love cats" → "J'aime les chats"

Step 1: Encoder produces hidden states


h₁ = [0.2, 0.8]  ("I")
h₂ = [0.9, 0.3]  ("love")
h₃ = [0.4, 0.7]  ("cats")

Step 2: When generating "chats", compute scores

  • Decoder state s = [0.5, 0.6]
  • Score with h₁: s · h₁ = 0.5×0.2 + 0.6×0.8 = 0.58
  • Score with h₂: s · h₂ = 0.5×0.9 + 0.6×0.3 = 0.63
  • Score with h₃: s · h₃ = 0.5×0.4 + 0.6×0.7 = 0.62

Step 3: Softmax to get attention weights

  • alpha = softmax([0.58, 0.63, 0.62]) ≈ [0.32, 0.34, 0.34]

Step 4: Context = weighted sum

  • c = 0.32×h₁ + 0.34×h₂ + 0.34×h₃ ≈ [0.51, 0.60]

Model focuses most on "love" and "cats" when generating "chats"!
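
The same numbers can be checked in a few lines of PyTorch (the values above are rounded to two decimals):

import torch
import torch.nn.functional as F

H = torch.tensor([[0.2, 0.8],   # h₁ ("I")
                  [0.9, 0.3],   # h₂ ("love")
                  [0.4, 0.7]])  # h₃ ("cats")
s = torch.tensor([0.5, 0.6])    # decoder state

scores = H @ s                    # tensor([0.5800, 0.6300, 0.6200])
alpha = F.softmax(scores, dim=0)  # ≈ tensor([0.3234, 0.3400, 0.3366])
c = alpha @ H                     # ≈ tensor([0.5053, 0.5963])
print(scores, alpha, c)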


Attention Score Functions 📐

Different ways to compute the score

1. Additive (Bahdanau):


# score = v^T * tanh(W1*s + W2*h)
score = v @ tanh(W1 @ s + W2 @ h)
  • More parameters, flexible

2. Multiplicative (Luong):


# score = s^T * W * h
score = s @ W @ h
  • Simpler, faster

3. Dot-Product:


# score = s^T * h
score = s @ h
  • No parameters!

4. Scaled Dot-Product (Transformers):


# score = (s^T * h) / sqrt(d)
score = (s @ h) / sqrt(dim)
  • Prevents gradient issues

Reference: Luong et al. (2015) - "Effective Approaches to Attention-based Neural Machine Translation"
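
As a quick comparison, here is an illustrative sketch of all four score functions for a single decoder state s and encoder state h (the parameter matrices W, W1, W2 and vector v are random placeholders, not trained weights):

import math
import torch

d = 8
s, h = torch.randn(d), torch.randn(d)
W, W1, W2 = torch.randn(d, d), torch.randn(d, d), torch.randn(d, d)
v = torch.randn(d)

additive   = v @ torch.tanh(W1 @ s + W2 @ h)  # Bahdanau
general    = s @ (W @ h)                      # Luong (multiplicative)
dot        = s @ h                            # dot-product (no parameters)
scaled_dot = (s @ h) / math.sqrt(d)           # scaled dot-product (Transformers)
print(additive, general, dot, scaled_dot)     # four scalar scores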


Attention Visualization 👁️

Example: English → French translation with attention weights

Input: "The European Economic Area"
Output: "La zone economique europeenne"


              The    European  Economic   Area
La           [0.8]    0.1       0.05      0.05
zone         0.05    [0.1]      0.1      [0.75]  ← "zone" = "Area"
économique   0.05     0.1      [0.8]      0.05
européenne   0.05    [0.8]      0.1       0.05   ← reordering!

Observations:

  • Diagonal pattern for similar word order
  • Model learns alignment automatically!
  • "europeenne" attends to "European" (reordering handled!)
  • No need for explicit word alignment annotations

Attention provides interpretability: we can see what the model is "looking at"
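
If you want to reproduce a plot like this, a short matplotlib sketch (illustrative code, with the weights copied from the table above) renders the attention matrix as a heatmap:

import matplotlib.pyplot as plt
import numpy as np

src = ["The", "European", "Economic", "Area"]
tgt = ["La", "zone", "économique", "européenne"]
A = np.array([[0.80, 0.10, 0.05, 0.05],
              [0.05, 0.10, 0.10, 0.75],
              [0.05, 0.10, 0.80, 0.05],
              [0.05, 0.80, 0.10, 0.05]])

plt.imshow(A, cmap="Greens")                 # rows: target words, columns: source words
plt.xticks(range(len(src)), src)
plt.yticks(range(len(tgt)), tgt)
plt.colorbar(label="attention weight")
plt.title("Decoder attention over source words")
plt.show()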


Benefits of Attention Mechanisms ✅

  1. Solves the Bottleneck Problem
     • Decoder has access to all encoder states
     • No information compression into a single vector
     • Works well for long sequences
  2. Improves Performance
     • Better BLEU scores on translation tasks
     • Handles long-range dependencies
     • More robust to sequence length
  3. Provides Interpretability
     • Can visualize what the model focuses on
     • Helps debug and understand model behavior
     • Builds trust in model predictions
  4. Enables Better Alignment
     • Learns source-target correspondences
     • No need for external alignment tools
     • Works across different language pairs

Attention became the foundation for modern NLP!


Real-World Impact of Attention 🌍

Attention mechanisms revolutionized multiple domains:

Machine Translation:

  • Google Translate (2016)
  • DeepL
  • Facebook translations
  • Dramatic quality improvements

Text Summarization:

  • News article summarization
  • Document understanding
  • Email auto-responses

Question Answering:

  • Reading comprehension
  • Search engines
  • Virtual assistants

Speech Recognition:

  • Attend to acoustic features
  • Better transcription accuracy
  • Listen, Attend and Spell models
Key Milestone

By 2016, attention-based models became the standard for sequence-to-sequence tasks, paving the way for the Transformer revolution in 2017.


Implementing Attention in PyTorch 💻

Simple attention mechanism implementation


import torch
import torch.nn as nn
import torch.nn.functional as F

class BahdanauAttention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.W_dec = nn.Linear(hidden_dim, hidden_dim)
        self.W_enc = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1)

    def forward(self, decoder_hidden, encoder_outputs):
        # Compute scores: how relevant is each encoder state?
        dec = self.W_dec(decoder_hidden).unsqueeze(1)  # [batch, 1, hidden]
        enc = self.W_enc(encoder_outputs)              # [batch, seq_len, hidden]
        scores = self.v(torch.tanh(dec + enc))         # [batch, seq_len, 1]

        # Normalize to get attention weights (sum to 1)
        attn_weights = F.softmax(scores, dim=1)        # [batch, seq_len, 1]

        # Weighted sum of encoder outputs
        context = torch.sum(attn_weights * encoder_outputs, dim=1)

        return context, attn_weights.squeeze(-1)

Using the Attention Module 💻

Complete example with sample data


# Initialize attention module
attn = BahdanauAttention(hidden_dim=64)

# Sample encoder outputs (3 words, 64-dim hidden state)
encoder_outputs = torch.randn(1, 3, 64)  # [batch=1, seq_len=3, hidden=64]

# Current decoder hidden state
decoder_hidden = torch.randn(1, 64)      # [batch=1, hidden=64]

# Compute attention
context, weights = attn(decoder_hidden, encoder_outputs)

print(f"Context shape: {context.shape}")    # [1, 64]
print(f"Attention weights: {weights}")      # [1, 3] - sums to 1.0!

# Example output:
# Attention weights: tensor([[0.28, 0.45, 0.27]])
#                           "The" "cat" "sat"
# Model focuses most on "cat" when generating next output word!

Discussion Questions 💭

  1. Why is attention called "soft alignment"?
     • How is it different from hard alignment?
     • What are the advantages of soft vs. hard?
  2. Computational Cost:
     • What is the time complexity of attention?
     • How does it scale with sequence length?
     • When might this be a problem?
  3. Interpretability:
     • Can we always trust attention weights as explanations?
     • What about when attention is uniform across all inputs?
  4. Beyond Seq2Seq:
     • Where else might attention be useful?
     • Can we apply it within a single sequence?
     • (Spoiler: Yes! That's self-attention → next lecture!)

Looking Ahead 🔮

What's Next?

Today we learned:

  • The context problem in NLP
  • Sequence-to-sequence architecture
  • The bottleneck problem
  • How attention mechanisms work
  • Attention as alignment and interpretation

Next lecture (Lecture 16):

  • Self-attention: Attention within a sequence
  • The Transformer: "Attention is All You Need"
  • The Transformer architecture: The new framework
  • Multi-head attention: Learning diverse relationships
  • Positional encoding: Injecting order information

Get ready for the Transformer revolution! 🚀


Summary 🎯

Key Takeaways:

  1. Context Matters
     • Static embeddings can't capture context-dependent meanings
     • Need dynamic representations based on context
  2. Seq2Seq Bottleneck
     • Fixed-size context vector limits performance
     • Information loss for long sequences
  3. Attention is the Solution
     • Dynamic access to all encoder states
     • Weighted combination based on relevance
     • Attention weights sum to 1.0 (probability distribution)
  4. Benefits
     • Better performance on long sequences
     • Automatic alignment learning
     • Model interpretability

Attention mechanisms laid the foundation for modern transformer-based models!


References 📚

Key Papers:

  • Sutskever et al. (2014) - "Sequence to Sequence Learning with Neural Networks"
    • Introduced encoder-decoder architecture
    • Foundation for seq2seq models
  • Bahdanau et al. (2015) - "Neural Machine Translation by Jointly Learning to Align and Translate"
    • Introduced additive attention mechanism
    • Solved the bottleneck problem
  • Luong et al. (2015) - "Effective Approaches to Attention-based Neural Machine Translation"
    • Multiplicative attention variants
    • Global vs. local attention
  • Vaswani et al. (2017) - "Attention Is All You Need"
    • The Transformer architecture (next lecture!)
    • Scaled dot-product attention

Additional Resources:

  • Jay Alammar's blog: "Visualizing A Neural Machine Translation Model"
  • Distill.pub: "Attention and Augmented Recurrent Neural Networks"

Questions? 🙋

Discussion Time

Office Hours Topics:

  • Implementing attention from scratch
  • Different attention mechanisms
  • Debugging attention-based models
  • Assignment 4 preparation

Thank you! 🙏

See you next lecture for Transformers!
