PSYC 51.07: Models of Language and Communication

Lecture 17: Training Transformers

Week 5, Lecture 3 - From Architecture to Implementation

Winter 2026

Today's Agenda 📋

  1. 📍 Positional Encoding: Injecting sequence order
  2. 🔄 Feed-Forward Networks: The other key component
  3. 🔗 Layer Norm & Residuals: Training deep networks
  4. ⚡ FlashAttention: Making transformers faster
  5. 🏗️ Three Architectures: Encoder, Decoder, Both
  6. 💻 Practical Implementation: Using HuggingFace

Goal: Complete understanding of transformer training and implementation


Positional Encoding 📍

Problem: Self-attention is permutation-invariant!

Without position information:

  • "The cat sat" = "sat cat the" = "cat the sat"
  • Order matters in language!

Solution: Add positional encodings to input embeddings

Original Transformer (Sinusoidal):


# PE(pos, 2i)   = sin(pos / 10000^(2i/d))
# PE(pos, 2i+1) = cos(pos / 10000^(2i/d))

# Position 0, dim 0: sin(0/10000^0) = 0
# Position 1, dim 0: sin(1/10000^0) = 0.84
# Position 2, dim 0: sin(2/10000^0) = 0.91

Properties:

  • Deterministic, unique per position
  • Generalizes to unseen lengths

Concrete Example:


Token embeddings:
"The" = [0.2, 0.5, 0.1, 0.8]
"cat" = [0.9, 0.3, 0.7, 0.2]

Positional encodings:
pos_0 = [0.0, 1.0, 0.0, 1.0]
pos_1 = [0.84, 0.54, 0.01, 1.0]

Final input (add them):
"The" = [0.2, 1.5, 0.1, 1.8]
"cat" = [1.74, 0.84, 0.71, 1.2]
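
To make the formula concrete, here is a minimal PyTorch sketch that builds the full encoding matrix (the helper name sinusoidal_positional_encoding is illustrative, not from a library):

import torch

def sinusoidal_positional_encoding(max_len, d_model):
    """Build the [max_len, d_model] sinusoidal encoding matrix."""
    position = torch.arange(max_len).unsqueeze(1).float()   # [max_len, 1]
    dim = torch.arange(0, d_model, 2).float()                # even dims: 0, 2, 4, ...
    div_term = torch.pow(10000.0, dim / d_model)             # 10000^(2i/d)
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(position / div_term)             # even dimensions: sin
    pe[:, 1::2] = torch.cos(position / div_term)             # odd dimensions: cos
    return pe

pe = sinusoidal_positional_encoding(max_len=512, d_model=4)
print(pe[1])   # ≈ [0.84, 0.54, 0.01, 1.00], matching pos_1 above
# In a model: x = token_embeddings + pe[:seq_len]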

References: Vaswani et al. (2017), Su et al. (2021)


Why Sinusoidal Positional Encoding? 🌊

Advantages of sine/cosine functions:

1. Unique Patterns

  • Each position gets unique vector
  • Different frequencies per dimension

2. Relative Position Info

  • PE(pos+k) = linear function of PE(pos)
  • Model learns relative positions

3. Extrapolation

  • Works for sequences longer than training
  • No need to retrain

4. No Parameters

  • Deterministic, saves memory

Visualization:


Dim 0 (high freq): ~~~~~ (fast oscillation)
Dim 1 (mid freq):  ~~~   (medium)
Dim 2 (low freq):  ~     (slow)

Position 0: [0.0, 0.0, 0.0, ...]
Position 1: [0.84, 0.01, 0.0001, ...]
Position 2: [0.91, 0.02, 0.0002, ...]
...
Position 100: [0.51, 0.86, 0.01, ...]

Each position has a unique "barcode"!

Visualizing Positional Encodings 👁️

Each position gets a unique pattern across dimensions


[Figure: positional encoding values plotted for positions 0-9 across embedding dimensions 0-7]

Key Insight:

  • Lower dimensions: Rapid oscillation (high frequency)
  • Higher dimensions: Slow oscillation (low frequency)
  • Creates a unique "barcode" for each position
  • Model learns to use these patterns for positional awareness
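
To reproduce a heatmap like the one described above, a short matplotlib sketch (assuming the sinusoidal_positional_encoding helper from the earlier sketch):

import matplotlib.pyplot as plt

pe = sinusoidal_positional_encoding(max_len=10, d_model=8)
plt.imshow(pe.numpy(), aspect='auto', cmap='RdBu')   # rows = positions, columns = dimensions
plt.xlabel('Embedding dimension')
plt.ylabel('Position')
plt.colorbar(label='Encoding value')
plt.title('Each row is a unique positional "barcode"')
plt.show()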

Feed-Forward Networks 🔄

After attention, apply position-wise feed-forward network

Architecture: Two linear layers with ReLU/GELU activation


import torch.nn as nn

class FeedForward(nn.Module):
    def __init__(self, d_model=768, d_ff=3072):  # 4x expansion
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)    # 768 → 3072
        self.linear2 = nn.Linear(d_ff, d_model)    # 3072 → 768
        self.relu = nn.ReLU()

    def forward(self, x):
        # x: [batch, seq_len, 768]
        x = self.linear1(x)   # [batch, seq_len, 3072]
        x = self.relu(x)      # Non-linearity!
        x = self.linear2(x)   # [batch, seq_len, 768]
        return x

# Applied to each position independently
# Same weights for all positions

Purpose: Add non-linearity and increase model capacity

  • Attention is mostly linear (weighted sums)
  • FFN adds expressiveness through ReLU/GELU

Why Feed-Forward Networks? 🤔

Role in the Transformer:

  1. Add Non-linearity
     • Attention is mostly linear operations (weighted sums)
     • FFN introduces non-linear transformations
     • ReLU/GELU activation adds expressiveness
  2. Position-wise Processing
     • Attention mixes information across positions
     • FFN processes each position independently
     • Allows position-specific feature transformations
  3. Increase Model Capacity
     • Expansion (4x) provides more parameters
     • Can learn complex feature combinations
     • Most parameters in a transformer are in the FFN layers! (see the quick count below)
  4. Feature Refinement
     • Attention gathers context
     • FFN refines and transforms the representation
     • Two complementary operations
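
To make the capacity point concrete, a rough per-layer parameter count for a BERT-base-sized block (d_model=768, d_ff=3072; bias terms ignored for simplicity):

d_model, d_ff = 768, 3072

# Multi-head self-attention: W_Q, W_K, W_V, W_O, each d_model x d_model
attention_params = 4 * d_model * d_model   # 2,359,296 ≈ 2.4M

# Feed-forward: two linear layers, d_model -> d_ff -> d_model
ffn_params = 2 * d_model * d_ff            # 4,718,592 ≈ 4.7M

print(attention_params, ffn_params)
# The FFN holds roughly twice as many parameters as attention in each block.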

Layer Normalization & Residual Connections 🔗

Critical for training deep transformers!

Residual Connections:


# output = sublayer(x) + x
x = x + self.attention(x)
x = x + self.feedforward(x)

  • Gradients flow directly through
  • Prevents vanishing gradients
  • Enables 96-layer models (GPT-3)!

Layer Normalization:


# Normalize across features (not batch)
def layer_norm(x, gamma, beta, eps=1e-5):
    mean = x.mean(dim=-1, keepdim=True)
    std = x.std(dim=-1, keepdim=True, unbiased=False)  # population std over features
    return gamma * (x - mean) / (std + eps) + beta

# Example: x = [0.2, 0.8, 0.5]
# mean = 0.5, std ≈ 0.25
# normalized ≈ [-1.2, 1.2, 0.0]

Standard Pattern (Post-Norm):


x = LayerNorm(x + Attention(x))
x = LayerNorm(x + FeedForward(x))

Pre-Norm vs Post-Norm 🔄

Two ways to arrange LayerNorm and residual connections

Post-Norm (Original):


# Normalize AFTER residual
x = LayerNorm(x + Attention(x))
x = LayerNorm(x + FFN(x))

  • Used in original Transformer, BERT
  • Can be unstable for deep models
  • Requires careful initialization

Pre-Norm (Modern):


# Normalize BEFORE sublayer
x = x + Attention(LayerNorm(x))
x = x + FFN(LayerNorm(x))

  • Used in GPT-2, GPT-3, LLaMA
  • More stable gradients
  • Easier to train 96+ layer models
Recommendation

For deep transformers (24+ layers), Pre-Norm is preferred due to better training stability. Post-Norm can achieve slightly better final performance with careful tuning.


Complete Transformer Block 🧱

Putting it all together:


class TransformerBlock(nn.Module):
    def __init__(self, d_model=768, n_heads=12, d_ff=3072):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads)
        self.ffn = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # Self-attention with residual
        x = x + self.attention(self.norm1(x))
        # Feed-forward with residual
        x = x + self.ffn(self.norm2(x))
        return x

# Stack N blocks!
encoder = nn.Sequential(*[TransformerBlock() for _ in range(12)])

Model sizes:

  • BERT-base: 12 blocks, 110M params | BERT-large: 24 blocks, 340M params
  • GPT-3: 96 blocks, 175B params!

FlashAttention: Making Transformers Faster ⚡

Problem: Standard attention is slow and memory-hungry!

Standard Attention Complexity
  • Time: O(n^2) where n = sequence length
  • Memory: O(n^2) to store attention matrix
  • Bottleneck: Reading/writing to GPU memory (HBM)

FlashAttention Innovation:

  • Tile-based computation
  • Uses fast SRAM instead of slow HBM
  • Fused operations (fewer memory reads)
  • Recomputation in backward pass

# Standard: materialize full n×n matrix
attn = softmax(Q @ K.T / sqrt(d))
out = attn @ V  # O(n^2) memory

# FlashAttention: compute in tiles
for tile in tiles:
    # Only load small tile to SRAM
    # Never materialize full matrix!

Concrete Speedup:

Method            Speed      Memory
Standard          1x         1x
FlashAttention    3x faster  0.5x

Real Impact:

  • Sequence 1024→4096 on same GPU
  • Training 2-4x faster
  • Used in: LLaMA, GPT-4, Mistral
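
You rarely need to implement the tiling yourself. For example, PyTorch's built-in F.scaled_dot_product_attention can dispatch to a FlashAttention-style fused kernel on supported GPUs (a minimal sketch; shapes are illustrative):

import torch
import torch.nn.functional as F

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# [batch, heads, seq_len, head_dim]
q = torch.randn(1, 12, 4096, 64, device=device)
k = torch.randn(1, 12, 4096, 64, device=device)
v = torch.randn(1, 12, 4096, 64, device=device)

# Fused attention: PyTorch picks the fastest available backend;
# the full 4096 x 4096 score matrix is never materialized in HBM.
out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(out.shape)   # torch.Size([1, 12, 4096, 64])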

Reference: Dao et al. (2022) - "FlashAttention"


Other Attention Optimizations ⚙️

Addressing the O(n^2) problem:

  1. Sparse Attention
     • Only attend to a subset of positions
     • Local windows + global tokens
     • Used in: Longformer, BigBird
  2. Linear Attention
     • Approximate attention with linear complexity
     • Kernel trick to avoid materializing the attention matrix
     • Used in: Performer, Linear Transformer
  3. Low-Rank Approximation
     • Factorize the attention matrix
     • Reduce memory footprint
     • Used in: Linformer
  4. Sliding Window
     • Fixed-size local attention window
     • Constant memory usage per position
     • Used in: Mistral 7B (see the sliding-window sketch below)

Trade-off: Efficiency vs. expressiveness. Full attention often still best for quality.
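
As a toy illustration of the sliding-window idea (a sketch, not Mistral's actual implementation), a boolean mask that lets each position attend only to itself and its most recent predecessors:

import torch

def sliding_window_mask(seq_len, window):
    """True where attention is allowed: causal, and at most `window` tokens back."""
    i = torch.arange(seq_len).unsqueeze(1)   # query positions
    j = torch.arange(seq_len).unsqueeze(0)   # key positions
    return (j <= i) & (j > i - window)

print(sliding_window_mask(6, 3).int())
# Each row has at most 3 ones: the token itself plus its 2 predecessors,
# so memory per position stays constant regardless of sequence length.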


Three Transformer Architectures 🏗️

  • Encoder-only (e.g., BERT): full bidirectional self-attention
    Best for: Classification, NER, QA
  • Decoder-only (e.g., GPT): causal (masked) self-attention
    Best for: Generation, completion
  • Encoder-Decoder (e.g., T5, BART, mT5): decoder uses masked self-attention + cross-attention over the encoder
    Best for: Translation, summarization

Key Difference: Attention Masking

  • Encoder: Full self-attention (bidirectional)
  • Decoder: Causal/masked attention (unidirectional); see the causal mask sketch below
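
A minimal sketch of the causal mask behind this difference (5-token example):

import torch

seq_len = 5
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
print(causal_mask.int())
# tensor([[1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0],
#         [1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
# Encoder (bidirectional): no mask, every position attends to every other position.
# Decoder (causal): position i attends only to positions <= i.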

Visual Comparison: Encoder vs Decoder vs Both


[Diagram: Encoder-only (BERT) uses bidirectional self-attention; Decoder-only (GPT) uses causal (masked) attention]

Choosing the Right Architecture:

  • Need to understand full context? → Encoder (BERT)
  • Need to generate text? → Decoder (GPT)
  • Need both (translate, summarize)? → Encoder-Decoder (T5)

Using Transformers in Practice 💻

HuggingFace makes it easy!


from transformers import AutoModel, AutoTokenizer

# Load pre-trained model (downloads ~440MB first time)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize input
text = "The animal didn't cross the street because it was too tired"
inputs = tokenizer(text, return_tensors="pt")
print(inputs['input_ids'])
# tensor([[  101,  1996,  4111,  2134,  1005,  1056,  2892,  1996,
#           2395,  2138,  2009,  2001,  2205,  5458,   102]])
#         [CLS]  The  animal didn  '     t    cross  the ...

# Get contextualized embeddings
outputs = model(**inputs)
hidden_states = outputs.last_hidden_state  # [1, 15, 768]

# "it" is at position 10 - its embedding knows it refers to "animal"!
it_embedding = hidden_states[0, 10, :]  # 768-dim context-aware vector

Reference: HuggingFace Course - Chapter 1.4


Comparing Architectures: Code Examples 💻

Different architectures for different tasks


from transformers import (
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
)

# 1. ENCODER-ONLY (BERT): Classification
encoder_model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)
# Use for: Sentiment analysis, NER, classification

# 2. DECODER-ONLY (GPT): Text generation
decoder_model = AutoModelForCausalLM.from_pretrained("gpt2")
# Use for: Text completion, creative writing, few-shot learning

# 3. ENCODER-DECODER (T5): Seq2Seq tasks
seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# Use for: Translation, summarization, question answering

# All use transformer architecture, different attention patterns!

Key Takeaway: Choose architecture based on your task!


Practical Tips for Training Transformers 💡

1. Learning Rate & Warmup


from transformers import get_linear_schedule_with_warmup

# Warmup: gradually increase LR
# Then decay (linear or cosine)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=1000,   # ~10% of training
    num_training_steps=10000
)
# Typical LRs: fine-tuning 2e-5, from scratch 1e-4

2. Optimizer


from torch.optim import AdamW

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    weight_decay=0.01  # L2 regularization
)

3. Regularization


# Dropout after attention and FFN
self.dropout = nn.Dropout(0.1)

# Gradient clipping
torch.nn.utils.clip_grad_norm_(
    model.parameters(), max_norm=1.0
)

4. Mixed Precision (FP16)


from torch.cuda.amp import autocast

with autocast():  # Use FP16
    outputs = model(inputs)
    loss = criterion(outputs, labels)
# 2x faster, 2x less memory!
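
For a full training step, autocast is usually paired with a gradient scaler so that small FP16 gradients don't underflow. A sketch using the same torch.cuda.amp API (model, optimizer, criterion, and dataloader are assumed to exist):

from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

for inputs, labels in dataloader:
    optimizer.zero_grad()
    with autocast():                       # forward pass in FP16 where safe
        outputs = model(inputs)
        loss = criterion(outputs, labels)
    scaler.scale(loss).backward()          # scale loss to avoid FP16 underflow
    scaler.step(optimizer)                 # unscales gradients, then steps
    scaler.update()                        # adjusts the scale factor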

Computational Efficiency Tips ⚙️

  1. Batch Size
     • Larger batches = better GPU utilization
     • Use gradient accumulation if GPU memory is limited (see the sketch below)
     • Typical effective batch size: 256-2048 examples
  2. Sequence Length
     • Shorter sequences train faster (quadratic complexity!)
     • Consider truncation or sliding windows
     • Pack multiple examples per sequence to maximize GPU usage
  3. Model Size
     • Start small, scale up if needed
     • DistilBERT: 40% smaller, 60% faster, 97% of BERT's performance
     • Consider model distillation for deployment
  4. Hardware
     • GPUs with high memory bandwidth (A100, H100)
     • Multi-GPU training with data parallelism
     • Use FlashAttention when available
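
A minimal sketch of the gradient accumulation mentioned above (model, optimizer, criterion, and dataloader are assumed to exist); it multiplies the effective batch size by accumulation_steps without extra memory:

accumulation_steps = 8   # effective batch = per-step batch x 8

optimizer.zero_grad()
for step, (inputs, labels) in enumerate(dataloader):
    outputs = model(inputs)
    loss = criterion(outputs, labels) / accumulation_steps   # average over the virtual batch
    loss.backward()                                          # gradients accumulate across steps
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()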

Discussion Questions 💭

  1. Positional Encoding:
     • Why add instead of concatenate?
     • What happens without positional encoding?
     • Learned vs. fixed: which is better?
  2. Architecture Choice:
     • When would you use encoder-only vs decoder-only?
     • Can GPT do classification? Can BERT generate?
     • Why has decoder-only become more popular recently?
  3. Scaling:
     • Is bigger always better?
     • What are the limits to scaling transformers?
     • How do we make them more efficient?
  4. Training Stability:
     • Why are residual connections so important?
     • Pre-norm vs post-norm: trade-offs?

Looking Ahead to Week 6 🔮

This week (Week 5) we learned:

  • Attention mechanisms (Lecture 12)
  • Self-attention and transformer architecture (Lecture 13)
  • Training transformers: all the components (Lecture 14)

Next week (Week 6):

  • Masked Language Modeling
  • Pre-training and fine-tuning
  • Contextual embeddings in action
  • RoBERTa, ALBERT, DistilBERT
  • Improvements and optimizations
  • Real-world BERT applications
  • Cognitive neuroscience connections
  • Understanding vs. pattern matching

Assignment 4: Building context-aware systems (check syllabus for details)


Summary 🎯

Key Takeaways:

  1. Positional Encoding
     • Sine/cosine functions inject position information
     • Enables model to understand order
  2. Feed-Forward Networks
     • Position-wise transformations
     • Add non-linearity and capacity
  3. LayerNorm & Residuals
     • Critical for training deep networks
     • Stabilize gradients, enable deeper models
  4. Three Architectures
     • Encoder (BERT), Decoder (GPT), Both (T5)
     • Choose based on task requirements
  5. Practical Considerations
     • Use HuggingFace for easy implementation
     • FlashAttention for efficiency
     • Careful hyperparameter tuning

References 📚

Essential Papers:

  • Vaswani et al. (2017) - "Attention Is All You Need" (the original Transformer paper)
  • Su et al. (2021) - "RoFormer: Enhanced Transformer with Rotary Position Embedding" (modern positional encoding approach)
  • Dao et al. (2022) - "FlashAttention: Fast and Memory-Efficient Exact Attention" (making transformers faster)

Tutorials:

  • HuggingFace Course - Chapter 1.4


Questions? 🙋

Discussion Time

Topics for discussion:

  • Positional encoding approaches
  • Architecture choices
  • Training tips and tricks
  • Implementation questions
  • Assignment 4 preparation

Thank you! 🙏

See you next week for BERT!
