PSYC 51.07: Models of Language and Communication

Lecture 19: BERT Variants

Week 6, Lecture 2 - Improvements and Optimizations

Winter 2026

Today's Agenda 📋

  1. 🚀 RoBERTa: Robustly Optimized BERT
  2. 🔬 ALBERT: A Lite BERT with parameter sharing
  3. ⚡ DistilBERT: Knowledge distillation for efficiency
  4. 🔌 ELECTRA: Replace Token Detection
  5. 📊 Comparative Analysis: When to use which variant
  6. 💻 Practical Considerations: Model selection guide

Goal: Understand improvements to BERT and choose the right model


BERT's Limitations ⚠️

What could be improved?

  1. Training Procedure
     • Some choices seemed arbitrary
     • NSP task might not be useful
     • Static masking (same masks every epoch)
  2. Model Size
     • 110M (Base) or 340M (Large) parameters
     • Large memory footprint
     • Slow inference
  3. Training Efficiency
     • Only 15% of tokens are predicted
     • 85% of computation "wasted"?
     • Could we learn more efficiently?
  4. Data and Compute
     • Trained on limited data (3.3B words)
     • Modern datasets much larger
     • Could benefit from more training

Many variants address these issues!


RoBERTa: Robustly Optimized BERT 🚀

Key idea: Better training = Better performance

RoBERTa's Improvements (Liu et al. 2019):

  1. Remove NSP Task
     • Next Sentence Prediction hurt performance
     • Use only Masked Language Modeling
     • Full sentences (don't need sentence pairs)
  2. Dynamic Masking
     • Generate a new masking pattern every time a sequence is seen
     • BERT: static masks (same for every epoch)
     • More diverse training signal
  3. Larger Batches, More Data
     • Batch size: 8K sequences (vs BERT's 256)
     • 160GB text (vs BERT's 16GB)
     • Longer training (500K steps vs 100K)
  4. Longer Sequences
     • Train with longer sequences
     • Better for downstream tasks

Reference: Liu et al. (2019) - "RoBERTa: A Robustly Optimized BERT Pretraining Approach"


RoBERTa Results 📈

Consistent improvements over BERT

  Task        BERT-Large   RoBERTa-Large   Gain
  SQuAD 2.0   83.1         89.4            +6.3
  MNLI        86.7         90.2            +3.5
  SST-2       94.9         96.4            +1.5
  RACE        72.0         83.2            +11.2

Key Findings:

  • Dynamic masking better than static
  • More data + longer training = better results
  • RoBERTa-Large matches or beats BERT-Large on all tasks
  • Sometimes huge gains (RACE: +11.2 points!)

Takeaway

Training procedure matters as much as architecture! RoBERTa shows that BERT was undertrained.


Dynamic vs Static Masking 🎭

How masking patterns are generated

Static Masking (BERT):

Preprocessing:

  • Mask tokens once
  • Save masked dataset
  • Same masks every epoch

Epoch 1: "My [MASK] is cute"

Epoch 2: "My [MASK] is cute"

Epoch 3: "My [MASK] is cute"

Problem:

  • Model sees same masks repeatedly
  • Less diverse training signal
  • Potential overfitting to mask patterns

Dynamic Masking (RoBERTa):

On-the-fly:

  • Generate masks during training
  • Different masks each time
  • More variety

Epoch 1: "My [MASK] is cute"

Epoch 2: "My dog [MASK] cute"

Epoch 3: "My dog is [MASK]"

Benefits:

  • More diverse training examples
  • Better generalization
  • Prevents memorization

Cost: Slight computational overhead, but worth it!
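To make the on-the-fly idea concrete, here is a minimal sketch of dynamic masking. The 15% rate and the example tokens come from the slides above; the function name and word-level masking are illustrative assumptions (real implementations mask token IDs inside the data collator).

# Minimal sketch of dynamic masking (illustrative only)
import random

def dynamic_mask(tokens, mask_prob=0.15):
    # A fresh random set of positions is chosen on every call
    return ["[MASK]" if random.random() < mask_prob else tok for tok in tokens]

tokens = ["My", "dog", "is", "cute"]
for epoch in range(1, 4):
    print(f"Epoch {epoch}:", " ".join(dynamic_mask(tokens)))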


ALBERT: A Lite BERT 🔬

Key idea: Parameter sharing for efficiency

ALBERT's Innovations (Lan et al. 2019):

1. Factorized Embedding


import torch.nn as nn

# BERT: direct embedding
# 30K vocab × 768 hidden ≈ 23M params
bert_embed = nn.Embedding(30000, 768)

# ALBERT: two-step (factorized) embedding
# 30K × 128 + 128 × 768 ≈ 3.8M + 0.1M params
albert_embed = nn.Embedding(30000, 128)
albert_project = nn.Linear(128, 768)
# Savings: ~83% fewer embedding params!

2. Cross-Layer Sharing

  • All 12 layers share same weights
  • 89% parameter reduction

3. Sentence Order Prediction


# NSP (BERT): Is B after A? (too easy)
# SOP (ALBERT): Are A, B in order?
#   - Positive: [A, B] (correct order)
#   - Negative: [B, A] (swapped order)
# Harder task → better representations
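For concreteness, a tiny sketch of how SOP training pairs could be built from two consecutive text segments; the function name and the 50/50 split are illustrative assumptions, not ALBERT's actual data pipeline.

# Illustrative sketch of building SOP examples
import random

def make_sop_example(segment_a, segment_b):
    # segment_a and segment_b are assumed to be consecutive in the source text
    if random.random() < 0.5:
        return (segment_a, segment_b), 1   # positive: original order
    return (segment_b, segment_a), 0       # negative: swapped order

pair, label = make_sop_example("My dog is cute.", "He likes playing.")
print(pair, label)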

Reference: Lan et al. (2019) - "ALBERT: A Lite BERT for Self-supervised Learning"


ALBERT Parameter Efficiency 📊

Dramatic parameter reduction!

  Model            Layers   Hidden size   Parameters
  ALBERT-base      12       768           12M
  ALBERT-large     24       1024          18M
  ALBERT-xlarge    24       2048          60M
  ALBERT-xxlarge   12       4096          235M

Key Observations:

  • ALBERT-base: 12M parameters vs. BERT-base's 110M (roughly 9× fewer)
  • Can train much larger hidden sizes with same memory
  • ALBERT-xxlarge: 4096 hidden dim, still only 235M params
  • Trade-off: fewer params but similar computation (layer sharing)

Benefits

  • Lower memory footprint
  • Easier to deploy
  • Can scale to larger hidden dimensions
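If the transformers library is available, the counts in the table above are easy to sanity-check; this sketch uses HuggingFace's num_parameters() helper and downloads the checkpoints on first run.

# Sanity-check the parameter counts with HuggingFace
from transformers import AutoModel

for name in ["bert-base-uncased", "albert-base-v2"]:
    model = AutoModel.from_pretrained(name)
    print(f"{name}: {model.num_parameters() / 1e6:.0f}M parameters")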

Cross-Layer Parameter Sharing 🔗

How ALBERT achieves parameter efficiency

BERT (No Sharing):


class BERT:
    def __init__(self):
        # Each layer has unique parameters
        self.layers = [
            TransformerLayer() for _ in range(12)
        ]
        # 12 × ~7M params ≈ 85M params

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)  # Different weights
        return x

ALBERT (Full Sharing):


class ALBERT:
    def __init__(self):
        # Single shared layer!
        self.shared_layer = TransformerLayer()
        # 1 × 7M params = 7M params

    def forward(self, x):
        for _ in range(12):
            x = self.shared_layer(x)  # Same weights!
        return x

Intuition: Each layer refines the representation. Like a "residual network unrolled" - iterative refinement with shared weights.

Trade-off: 89% fewer parameters, same compute (still 12 forward passes)
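A runnable PyTorch sketch of the same idea, using the stock nn.TransformerEncoderLayer as a stand-in for ALBERT's block (dimensions chosen to mirror BERT-base; this is not ALBERT's real implementation).

# Weight sharing with a stock PyTorch layer (sketch, not ALBERT itself)
import torch
import torch.nn as nn

shared_layer = nn.TransformerEncoderLayer(d_model=768, nhead=12, batch_first=True)

def albert_style_encoder(x, num_passes=12):
    # Reapplying the same module reuses one set of weights
    for _ in range(num_passes):
        x = shared_layer(x)
    return x

x = torch.randn(2, 16, 768)  # (batch, seq_len, hidden)
print(albert_style_encoder(x).shape)  # torch.Size([2, 16, 768])
print(f"{sum(p.numel() for p in shared_layer.parameters()) / 1e6:.1f}M shared params")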


DistilBERT: Knowledge Distillation ⚡

Key idea: Train small model to mimic large model


# Knowledge Distillation Training Loop (pseudocode)
teacher = BertModel.from_pretrained("bert-base")  # 12 layers, frozen
student = DistilBertModel(num_layers=6)           # 6 layers, trainable

for batch in training_data:
    # Teacher provides "soft targets" (probability distributions)
    with torch.no_grad():
        teacher_logits = teacher(batch)  # e.g., [0.7, 0.2, 0.1, ...]

    # Student tries to match teacher's distribution
    student_logits = student(batch)

    # Distillation loss: KL divergence between distributions
    # Temperature T=2 softens the distribution (more informative)
    loss_distill = KL_divergence(
        softmax(student_logits / T),
        softmax(teacher_logits / T)
    )

    # Also include MLM loss for language modeling
continued...

DistilBERT: Knowledge Distillation ⚡


    loss_mlm = masked_lm_loss(student_logits, labels)

    # Combined loss
    loss = 0.5 * loss_distill + 0.5 * loss_mlm
...continued

Why soft targets work: Teacher's "wrong" predictions contain information (e.g., "dog" → "cat" more likely than "car")
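A runnable sketch of the distillation term described above, with random logits standing in for real model outputs. The exact loss mix and weights in DistilBERT differ slightly, so treat this only as an illustration of the temperature-softened KL term.

# Temperature-softened distillation loss (sketch; logits are random stand-ins)
import torch
import torch.nn.functional as F

T = 2.0                                  # temperature
teacher_logits = torch.randn(8, 30522)   # (batch, vocab)
student_logits = torch.randn(8, 30522)

loss_distill = F.kl_div(
    F.log_softmax(student_logits / T, dim=-1),
    F.softmax(teacher_logits / T, dim=-1),
    reduction="batchmean",
) * (T ** 2)   # T^2 keeps gradient scale comparable across temperatures
print(loss_distill.item())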

Reference: Sanh et al. (2019) - "DistilBERT, a distilled version of BERT"


DistilBERT Results 📊

Significant efficiency gains!

  Metric            BERT-base   DistilBERT   Change
  Inference speed   1×          1.6×         60% faster
  GLUE score        79.6        77.0         97% retained

Performance on Specific Tasks:

  Task          BERT   DistilBERT
  SST-2 (Acc)   94.9   92.7
  MNLI (Acc)    86.7   82.2

When to Use DistilBERT:

  • Production deployment (latency-critical)
  • Edge devices (limited memory)
  • High-throughput scenarios
  • When 2-3% performance drop is acceptable

ELECTRA: Efficient Learning 🔌

Key idea: Learn from all tokens, not just 15%


# ELECTRA Training: Generator + Discriminator setup (pseudocode)
sentence = "The chef cooked a delicious meal"
masked   = "The chef [MASK] a delicious meal"

# Small generator (like BERT) fills in masks
generator_output = generator(masked)
# Generator predicts: "ate" (plausible but wrong)

corrupted = "The chef ate a delicious meal"

# Discriminator classifies EACH token: original or replaced?
discriminator_output = discriminator(corrupted)
# Output per token: [orig, orig, REPLACED, orig, orig, orig]

# Loss computed on ALL tokens (not just 15%!)
labels = [0, 0, 1, 0, 0, 0]  # 1 = replaced
loss = binary_cross_entropy(discriminator_output, labels)

Efficiency Gain

BERT: Learns from 15% of tokens (masked ones only)
ELECTRA: Learns from 100% of tokens (all get labeled)

Result: Same quality with 4x less compute!
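The per-token loss in the last line of the pseudocode corresponds to a binary cross-entropy over discriminator scores. A runnable stand-in, with random scores in place of a real discriminator, looks like this.

# Replaced-token-detection loss with stand-in scores (sketch only)
import torch
import torch.nn.functional as F

disc_logits = torch.randn(1, 6)                    # one score per token
labels = torch.tensor([[0., 0., 1., 0., 0., 0.]])  # 1 = token was replaced
loss = F.binary_cross_entropy_with_logits(disc_logits, labels)
print(loss.item())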

Reference: Clark et al. (2020) - "ELECTRA: Pre-training Text Encoders as Discriminators"


ELECTRA Benefits ⚡

More efficient pre-training

Advantages:

  1. Sample Efficiency
     • Learn from all tokens (100%) vs only masked (15%)
     • Reaches same performance with less data
     • Faster convergence
  2. Better Performance
     • ELECTRA-Small outperforms BERT-Small
     • ELECTRA-Base competitive with BERT-Large
     • With same compute, ELECTRA is better
  3. Computational Efficiency
     • Smaller generator (1/4 to 1/2 size of discriminator)
     • Faster to train than BERT
     • Lower computational cost for same quality

Key Insight

Replace Token Detection is more sample-efficient than Masked LM because it provides a learning signal for every token!


BERT Variants Comparison 📊

Summary of key variants

  Model        Key Innovation               Advantages                      Best For
  RoBERTa      Optimized training recipe    Strongest accuracy              Best quality
  ALBERT       Cross-layer weight sharing   Far fewer parameters            Limited memory
  DistilBERT   Knowledge distillation       40% smaller, 60% faster         Latency-critical deployment
  ELECTRA      Replaced token detection     Sample-efficient pre-training   Limited training budget

General Guidelines:

  • Best quality: RoBERTa-Large
  • Best efficiency: DistilBERT
  • Limited memory: ALBERT
  • Limited training budget: ELECTRA
  • Good default: RoBERTa-Base or BERT-Base

Other Notable BERT Variants 🌟

The BERT family keeps growing!

  1. DeBERTa (Microsoft, 2020)
     • Disentangled attention (separate content and position)
     • Enhanced mask decoder
     • State-of-the-art on SuperGLUE
  2. ERNIE (Baidu, 2019)
     • Entity-level and phrase-level masking
     • Knowledge enhancement
     • Strong on Chinese NLP tasks
  3. SpanBERT (Facebook, 2019)
     • Mask random spans instead of random tokens
     • Span boundary objective
     • Better for span-based tasks (QA, coreference)
  4. BART (Facebook, 2019)
     • Encoder-decoder (not encoder-only)
     • Denoising autoencoder with various corruptions
     • Excellent for generation tasks

Model Selection Guide 🎯

How to choose the right model for your task


Decision flow (reconstructed from the original flowchart): prioritize quality → RoBERTa-Large; prioritize speed → DistilBERT; tight on memory → ALBERT.

Additional Considerations:

  • Domain: Consider domain-specific pre-trained models (BioBERT, SciBERT, etc.)
  • Language: Multilingual? Use mBERT, XLM-R
  • Task type: Generation? Consider BART/T5 instead
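The same AutoModel interface covers the domain-specific and multilingual checkpoints mentioned above; only the model ID changes. A sketch, using the public HuggingFace Hub names for these checkpoints:

# Domain-specific and multilingual checkpoints load the same way (sketch)
from transformers import AutoModel

scibert = AutoModel.from_pretrained("allenai/scibert_scivocab_uncased")  # scientific text
biobert = AutoModel.from_pretrained("dmis-lab/biobert-v1.1")             # biomedical text
xlmr    = AutoModel.from_pretrained("xlm-roberta-base")                  # ~100 languages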

Using Different BERT Variants 💻

Easy switching with HuggingFace


from transformers import AutoModel, AutoTokenizer

# BERT
bert_model = AutoModel.from_pretrained("bert-base-uncased")
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# RoBERTa
roberta_model = AutoModel.from_pretrained("roberta-base")
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# ALBERT
albert_model = AutoModel.from_pretrained("albert-base-v2")
albert_tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

# DistilBERT
distilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# ELECTRA
electra_model = AutoModel.from_pretrained("google/electra-base-discriminator")
continued...

Using Different BERT Variants 💻


electra_tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

# All have the same API! (substitute any of the model/tokenizer pairs above)
inputs = roberta_tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = roberta_model(**inputs)
...continued
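As a usage note, here is a self-contained sketch of what the encoder returns, shown with roberta-base; the hidden size of 768 applies to all the base-sized models above.

# Inspecting encoder outputs (sketch; any of the base models works the same way)
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)

print(outputs.last_hidden_state.shape)           # (batch_size, seq_len, 768)
cls_embedding = outputs.last_hidden_state[:, 0]  # first-token embedding, often used for classification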

Benchmarking BERT Variants: Worked Example 💻

Practical comparison on sentiment analysis


import time
from transformers import pipeline

# Load different models for sentiment analysis
models = {
    "bert-base": "textattack/bert-base-uncased-SST-2",
    "distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "albert": "textattack/albert-base-v2-SST-2",
}

test_texts = ["This movie was fantastic!", "I hated every minute of it."] * 100

for name, model_id in models.items():
    pipe = pipeline("sentiment-analysis", model=model_id)

    start = time.time()
    results = pipe(test_texts)
    elapsed = time.time() - start

    print(f"{name}: {elapsed:.2f}s for 200 samples ({200/elapsed:.1f} samples/sec)")

Typical Results:

  Model        Accuracy   Speed (samples/sec)   Memory
  bert-base    93.2%      45                    420MB
  distilbert   91.3%      85                    250MB
  albert       92.7%      38                    45MB

Discussion Questions 💭

  1. Training vs Architecture:
     • RoBERTa shows training matters. Is architecture overrated?
     • How much can we improve with just better training?
     • What's the right balance?
  2. Parameter Efficiency:
     • ALBERT shares all layers. Why does this work?
     • What are the limits of parameter sharing?
     • Is there a "sweet spot"?
  3. Knowledge Distillation:
     • Why does the student learn better from the teacher than from labels?
     • What information is in the soft probabilities?
     • Can we distill even further?
  4. Model Selection:
     • How do you decide which model to use?
     • Is it worth fine-tuning multiple variants?
     • What about model ensembles?

Looking Ahead 🔮

Today we learned:

  • RoBERTa: Better training matters
  • ALBERT: Parameter sharing for efficiency
  • DistilBERT: Knowledge distillation
  • ELECTRA: Replace token detection
  • How to choose the right variant

Next lecture: Applications of Encoder Models

From theory to practice! 🚀


Summary 🎯

Key Takeaways:

  1. RoBERTa
     • Training procedure matters as much as architecture
     • Remove NSP, dynamic masking, more data = better results
  2. ALBERT
     • Parameter sharing dramatically reduces model size
     • Factorized embeddings for efficiency
     • 89% fewer parameters than BERT
  3. DistilBERT
     • Knowledge distillation for deployment
     • 40% smaller, 60% faster, 97% performance
  4. ELECTRA
     • Replace token detection more sample-efficient
     • Learn from all tokens, not just 15%
  5. Model Selection
     • Choose based on constraints (quality, speed, memory)
     • HuggingFace makes it easy to experiment

References 📚

Essential Papers:

  • Liu et al. (2019) - "RoBERTa: A Robustly Optimized BERT Pretraining Approach"
  • Lan et al. (2019) - "ALBERT: A Lite BERT for Self-supervised Learning of Language Representations"
  • Sanh et al. (2019) - "DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter"
  • Clark et al. (2020) - "ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators"
  • He et al. (2020) - "DeBERTa: Decoding-enhanced BERT with Disentangled Attention"

Resources:

  • HuggingFace Model Hub: https://huggingface.co/models
  • Papers With Code: BERT variants leaderboard
  • Model Cards: Detailed documentation for each variant

Questions? 🙋

Discussion Time

Topics for discussion:

  • BERT variants and improvements
  • Model selection strategies
  • Knowledge distillation
  • Parameter efficiency
  • Implementation questions

Thank you! 🙏

Next: Applications and Real-World Use Cases!
