/*
 * 2. Include this script:
 * 3. Create charts with minimal configuration - colors are auto-applied!
 */
(function() {
  'use strict';

  // Host global object: `window` in browsers, `globalThis` elsewhere
  // (e.g. headless test runners). Identical to `window` in a browser.
  const globalScope = typeof window !== 'undefined' ? window : globalThis;

  // ==========================================================================
  // READ COLORS FROM CSS CUSTOM PROPERTIES
  // This ensures chart colors stay in sync with the theme
  // ==========================================================================

  /**
   * Get a CSS custom property value from :root.
   * @param {string} name - Custom property name (e.g. '--dartmouth-green').
   * @param {string} [fallback=''] - Returned when the property is unset or
   *   no DOM/getComputedStyle is available.
   * @returns {string} Trimmed property value, or the fallback.
   */
  function getCSSVar(name, fallback = '') {
    if (typeof getComputedStyle === 'undefined') return fallback;
    const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim();
    return value || fallback;
  }

  /**
   * Build the full color/font palette from CSS custom properties,
   * falling back to hard-coded theme defaults when a property is unset.
   * @returns {object} Palette of colors, grid styles, and font family.
   */
  function buildPaletteFromCSS() {
    return {
      // Primary brand colors
      dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'),
      textPrimary: getCSSVar('--text-primary', '#0a2518'),
      textSecondary: getCSSVar('--text-secondary', '#0a3d23'),

      // Chart colors (from CSS --chart-color-N variables)
      chartColors: [
        getCSSVar('--chart-color-1', '#00693e'),
        getCSSVar('--chart-color-2', '#267aba'),
        getCSSVar('--chart-color-3', '#ffa00f'),
        getCSSVar('--chart-color-4', '#9d162e'),
        getCSSVar('--chart-color-5', '#8a6996'),
        getCSSVar('--chart-color-6', '#a5d75f'),
        getCSSVar('--chart-color-7', '#003c73'),
        getCSSVar('--chart-color-8', '#d94415'),
        getCSSVar('--chart-color-9', '#643c20'),
        getCSSVar('--chart-color-10', '#c4dd88'),
        getCSSVar('--chart-color-11', '#f5dc69'),
        getCSSVar('--chart-color-12', '#424141'),
      ],

      // Background colors (semi-transparent versions)
      chartBgColors: [
        getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'),
        getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'),
        getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'),
        getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'),
        getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'),
        getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'),
      ],

      // Semantic colors
      positive: getCSSVar('--chart-positive', '#00693e'),
      negative: getCSSVar('--chart-negative', '#9d162e'),
      neutral: getCSSVar('--chart-neutral', '#424141'),
      highlight: getCSSVar('--chart-highlight', '#ffa00f'),

      // Grid and axis colors
      gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'),
      gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'),
      gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'),
      axisColor: getCSSVar('--chart-axis-color', '#0a2518'),

      // Font
      fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"),
    };
  }

  // Initialize palette (will be populated lazily when first needed)
  let CDL_PALETTE = null;
  // For convenience, expose primary chart colors array
  let CHART_COLORS = null;

  // ==========================================================================
  // FONT CONFIGURATION
  // Responsive font sizes based on typical Marp slide dimensions (1280x720)
  // ==========================================================================
  const FONT_CONFIG = {
    sizes: {
      title: 22,      // Chart title
      subtitle: 18,   // Subtitle
      legend: 16,     // Legend labels
      axisTitle: 18,  // Axis titles
      axisTicks: 16,  // Axis tick labels
      tooltip: 14,    // Tooltip text
      dataLabels: 14, // Data labels on charts
    },
    weight: {
      normal: 400,
      medium: 500,
      bold: 600,
    },
  };

  // ==========================================================================
  // HELPER FUNCTIONS
  // ==========================================================================

  /**
   * Ensure the palette has been read from CSS (lazy, cached).
   * @returns {object} The initialized palette.
   */
  function ensurePalette() {
    if (!CDL_PALETTE) {
      CDL_PALETTE = buildPaletteFromCSS();
      CHART_COLORS = CDL_PALETTE.chartColors;
    }
    return CDL_PALETTE;
  }

  /**
   * Get color for a dataset at given index.
   * Cycles through the palette if there are more datasets than colors.
   * @param {number} index - Dataset index (any non-negative integer).
   * @returns {string} A CSS color string.
   */
  function getColor(index) {
    ensurePalette();
    return CHART_COLORS[index % CHART_COLORS.length];
  }

  /**
   * Get a color with the given alpha transparency.
   * Handles #rgb and #rrggbb hex as well as rgba(...) inputs; any other
   * format is returned unchanged.
   * @param {string} color - Source color.
   * @param {number} alpha - Alpha in [0, 1].
   * @returns {string} An rgba(...) string (or the input, if unrecognized).
   */
  function getColorWithAlpha(color, alpha) {
    // Handle hex colors (#rgb shorthand and #rrggbb)
    if (color.startsWith('#')) {
      let hex = color.slice(1);
      if (hex.length === 3) {
        // Expand shorthand, e.g. #0a3 -> #00aa33
        hex = hex.split('').map(function(c) { return c + c; }).join('');
      }
      const r = parseInt(hex.slice(0, 2), 16);
      const g = parseInt(hex.slice(2, 4), 16);
      const b = parseInt(hex.slice(4, 6), 16);
      return `rgba(${r}, ${g}, ${b}, ${alpha})`;
    }
    // Handle rgba colors: swap out the trailing alpha component
    if (color.startsWith('rgba')) {
      return color.replace(/[\d.]+\)$/, `${alpha})`);
    }
    return color;
  }

  /**
   * Generate colors for all datasets in chart data.
   * Automatically assigns colors (and sensible per-type styling) to any
   * dataset that does not already specify them. Mutates and returns `data`.
   * @param {object} data - Chart.js data object ({labels, datasets}).
   * @param {string} chartType - Chart.js chart type (e.g. 'bar', 'line').
   * @returns {object} The same data object, with colors filled in.
   */
  function autoAssignColors(data, chartType) {
    if (!data || !data.datasets) return data;

    data.datasets.forEach((dataset, index) => {
      const baseColor = getColor(index);

      // Only assign colors if not already specified
      switch (chartType) {
        case 'bar':
        case 'horizontalBar':
          if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; }
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; }
          break;

        case 'line':
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1);
          }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; }
          if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; }
          if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; }
          if (dataset.tension === undefined) { dataset.tension = 0.3; }
          break;

        case 'scatter':
        case 'bubble':
          if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; }
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; }
          if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; }
          break;

        case 'pie':
        case 'doughnut':
        case 'polarArea':
          // For pie charts, we need multiple colors for one dataset
          if (!dataset.backgroundColor) {
            const numItems = dataset.data ? dataset.data.length : 6;
            dataset.backgroundColor = [];
            for (let i = 0; i < numItems; i++) {
              dataset.backgroundColor.push(getColor(i));
            }
          }
          if (!dataset.borderColor) {
            dataset.borderColor = '#d8d8d8'; // Slide background
          }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; }
          break;

        case 'radar':
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2);
          }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; }
          if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; }
          if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; }
          break;

        default:
          // Generic color assignment
          if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; }
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
      }
    });

    return data;
  }

  // ==========================================================================
  // CHART.JS GLOBAL DEFAULTS
  // ==========================================================================

  /**
   * Apply theme-wide Chart.js defaults (fonts, colors, tooltips, scales).
   * @returns {boolean} True if defaults were applied; false if Chart.js is
   *   not loaded.
   */
  function applyGlobalDefaults() {
    if (typeof Chart === 'undefined') {
      console.warn('Chart.js not loaded. chart-defaults.js requires Chart.js to be loaded first.');
      return false;
    }

    // Ensure palette is loaded from CSS
    const palette = ensurePalette();

    // Font defaults
    Chart.defaults.font.family = palette.fontFamily;
    Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks;
    Chart.defaults.color = palette.textPrimary;

    // Responsive defaults
    Chart.defaults.responsive = true;
    Chart.defaults.maintainAspectRatio = false;

    // Animation (subtle)
    Chart.defaults.animation.duration = 400;

    // Legend
    Chart.defaults.plugins.legend.labels.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.legend,
      weight: FONT_CONFIG.weight.normal,
    };
    Chart.defaults.plugins.legend.labels.color = palette.textPrimary;
    Chart.defaults.plugins.legend.labels.usePointStyle = true;
    Chart.defaults.plugins.legend.labels.padding = 20;

    // Title
    Chart.defaults.plugins.title.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.title,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.title.color = palette.textPrimary;

    // Tooltip
    Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary;
    Chart.defaults.plugins.tooltip.titleFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.tooltip.bodyFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
    };
    Chart.defaults.plugins.tooltip.cornerRadius = 4;
    Chart.defaults.plugins.tooltip.padding = 10;

    // Scale defaults (for cartesian charts), applied per scale type below.
    const scaleDefaults = {
      grid: {
        color: palette.gridLight,
        lineWidth: 1,
      },
      border: {
        color: palette.gridDark,
        width: 1,
      },
      ticks: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        },
        color: palette.textPrimary,
      },
      title: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTitle,
          weight: FONT_CONFIG.weight.normal,
        },
        color: palette.textPrimary,
      },
    };

    // Apply shared scale defaults to each cartesian scale type.
    ['linear', 'category', 'logarithmic'].forEach(function(scaleType) {
      const scale = Chart.defaults.scales && Chart.defaults.scales[scaleType];
      if (!scale) return;
      if (scale.grid) Object.assign(scale.grid, scaleDefaults.grid);
      if (scale.border) Object.assign(scale.border, scaleDefaults.border);
      if (scale.ticks) Object.assign(scale.ticks, scaleDefaults.ticks);
      if (scale.title) Object.assign(scale.title, scaleDefaults.title);
    });

    // Radial scale (for radar charts) has a different structure.
    if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) {
      const radial = Chart.defaults.scales.radialLinear;
      if (radial.grid) radial.grid.color = palette.gridLight;
      if (radial.angleLines) radial.angleLines.color = palette.gridMedium;
      if (radial.pointLabels) {
        radial.pointLabels.font = {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        };
        radial.pointLabels.color = palette.textPrimary;
      }
    }

    return true;
  }

  // ==========================================================================
  // CHART WRAPPER FOR AUTO-STYLING
  // ==========================================================================

  /**
   * Wrap the Chart constructor so every chart created afterwards
   * automatically gets CDL colors and per-type option defaults.
   */
  function wrapChartConstructor() {
    if (typeof Chart === 'undefined') return;

    const OriginalChart = Chart;

    // Create a wrapper that auto-applies colors
    globalScope.Chart = function(ctx, config) {
      // Auto-assign colors if not specified
      if (config && config.data) {
        config.data = autoAssignColors(config.data, config.type);
      }
      // Merge default options for specific chart types
      if (config && config.options) {
        config.options = applyChartTypeDefaults(config.type, config.options);
      }
      // Call original constructor
      return new OriginalChart(ctx, config);
    };

    // Copy static properties and methods, and preserve the prototype chain
    // so `instanceof Chart` and static helpers still work.
    Object.setPrototypeOf(globalScope.Chart, OriginalChart);
    Object.assign(globalScope.Chart, OriginalChart);
    globalScope.Chart.prototype = OriginalChart.prototype;
  }

  /**
   * Apply chart-type specific option defaults.
   * Returns a new options object; nested objects that are modified are
   * cloned first so the caller's config is never mutated.
   * @param {string} chartType - Chart.js chart type.
   * @param {object} userOptions - User-supplied options.
   * @returns {object} Options with type-specific defaults filled in.
   */
  function applyChartTypeDefaults(chartType, userOptions) {
    const options = { ...userOptions };

    switch (chartType) {
      case 'bar':
      case 'horizontalBar':
        // Bar chart defaults (clone before writing: don't mutate caller's objects)
        options.scales = { ...options.scales };
        options.scales.x = { ...options.scales.x };
        options.scales.y = { ...options.scales.y };
        // Hide x-axis grid for cleaner look
        if (options.scales.x.grid === undefined) {
          options.scales.x.grid = { display: false };
        }
        break;

      case 'line':
        // Line chart defaults
        if (!options.interaction) {
          options.interaction = { intersect: false, mode: 'index' };
        }
        break;

      case 'pie':
      case 'doughnut':
        // Pie/doughnut defaults (clone before writing)
        options.plugins = { ...options.plugins };
        if (options.plugins.legend === undefined) {
          const palette = ensurePalette();
          options.plugins.legend = {
            position: 'right',
            labels: {
              font: {
                family: palette.fontFamily,
                size: FONT_CONFIG.sizes.legend,
              },
              color: palette.textPrimary,
              padding: 15,
            },
          };
        }
        break;

      case 'radar':
        // Radar chart defaults - keep as-is, scale defaults applied globally
        break;

      case 'scatter':
      case 'bubble':
        // Scatter/bubble defaults (clone before writing)
        options.scales = { ...options.scales };
        options.scales.x = { ...options.scales.x };
        options.scales.y = { ...options.scales.y };
        break;
    }

    return options;
  }

  // ==========================================================================
  // CONVENIENCE FUNCTIONS FOR USERS
  // Exposed on window.CDLChart for easy access
  // ==========================================================================
  globalScope.CDLChart = {
    // Color palette access (getters to ensure lazy initialization)
    get colors() { return ensurePalette().chartColors; },
    get palette() { return ensurePalette(); },

    // Get specific color by index
    getColor: getColor,

    // Get color with transparency
    getColorWithAlpha: getColorWithAlpha,

    // Get array of colors for a specific count
    getColors: function(count) {
      ensurePalette();
      const result = [];
      for (let i = 0; i < count; i++) {
        result.push(getColor(i));
      }
      return result;
    },

    // Font configuration
    fonts: FONT_CONFIG,

    // Quick chart creation helpers
    // These create minimal config that auto-applies all styling

    /**
     * Create a simple bar chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    bar: function(canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'bar',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: {
          plugins: { legend: { display: false } },
          ...options,
        },
      });
    },

    /**
     * Create a simple line chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    line: function(canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'line',
        data: {
          labels: labels,
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
            fill: ds.fill !== undefined ? ds.fill : true,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a simple pie chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    pie: function(canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'pie',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: options,
      });
    },

    /**
     * Create a simple scatter chart.
     * @param {string} canvasId - Canvas element ID
     * @param {Array} datasets - Array of {label, data: [{x, y}]} objects
     * @param {object} options - Optional overrides
     */
    scatter: function(canvasId, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'scatter',
        data: {
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a doughnut chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    doughnut: function(canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'doughnut',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: options,
      });
    },

    /**
     * Create a radar chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    radar: function(canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'radar',
        data: {
          labels: labels,
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
          })),
        },
        options: options,
      });
    },
  };

  // ==========================================================================
  // INITIALIZATION
  // ==========================================================================

  /**
   * Apply defaults now if Chart.js is present; otherwise poll for up to
   * 5 seconds (Chart.js CDN may still be loading).
   * @returns {boolean} True if defaults were applied synchronously.
   */
  function initialize() {
    if (typeof Chart !== 'undefined') {
      applyGlobalDefaults();
      wrapChartConstructor();
      console.log('CDL Chart defaults applied successfully.');
      return true;
    }

    // Chart.js not yet loaded - wait and retry
    let retries = 0;
    const maxRetries = 50; // 5 seconds max wait
    const checkInterval = setInterval(function() {
      retries++;
      if (typeof Chart !== 'undefined') {
        clearInterval(checkInterval);
        applyGlobalDefaults();
        wrapChartConstructor();
        console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).');
      } else if (retries >= maxRetries) {
        clearInterval(checkInterval);
        console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.');
      }
    }, 100);
    // In non-browser hosts the timer would keep the process alive; unref it.
    // No-op in browsers, where setInterval returns a plain number.
    if (typeof checkInterval.unref === 'function') checkInterval.unref();
    return false;
  }

  // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts
  // Chart.js CDN should be loaded before this script
  initialize();
})();

Lecture 19: BERT variants

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Identify key limitations of the original BERT training procedure
  2. Explain how RoBERTa, ALBERT, DistilBERT, ELECTRA, and ModernBERT each address different limitations
  3. Compare parameter efficiency, training efficiency, and inference speed across variants
  4. Select the appropriate variant for a given task and resource constraint
  5. Argue whether encoder models remain relevant in the era of GPT-style decoders

Play around with different BERT variants in our companion Notebook!

What could be improved?

Training procedure: NSP may hurt performance; static masking reuses the same masks every epoch; only 15% of tokens provide training signal (recall the 80/10/10 MLM procedure).

Scale: Trained on only 3.3B words with 100K steps — modern datasets are 100× larger (and often train for much longer).

Efficiency: 110M parameters are all active for every input. Large memory footprint for deployment.

RoBERTa: robustly optimized BERT

RoBERTa (Liu et al., 2019) keeps BERT's architecture but fixes the training recipe:

  1. Remove NSP — Next Sentence Prediction hurt performance. Use only MLM with full sentences.
  2. Dynamic masking — Generate a new masking pattern every time a sequence is seen, instead of reusing the same mask.
  3. Larger batches, more data — Batch size 8K sequences (vs BERT's 256), 160GB text (vs 16GB), 500K steps (vs 100K).
  4. Longer sequences — Train on longer contiguous text for better long-range understanding.

Training procedure matters as much as architecture. RoBERTa shows that BERT was significantly undertrained.

Dynamic masking

BERT used static masking — the same mask positions every epoch, risking memorization. RoBERTa generates new masks on-the-fly during training:

  • Epoch 1: "My [MASK] is cute"
  • Epoch 2: "My dog [MASK] cute"
  • Epoch 3: "My dog is [MASK]"

More diverse training signal → better generalization. Combined with removing NSP and training longer on more data, this alone accounts for most of RoBERTa's gains.

RoBERTa results

Task (metric) BERT-Large RoBERTa Improvement
SQuAD 2.0 (F1) 83.1 89.4 +6.3
MNLI (accuracy) 86.7 90.2 +3.5
SST-2 (accuracy) 94.9 96.4 +1.5
RACE (accuracy) 72.0 83.2 +11.2

SQuAD 2.0: reading comprehension + unanswerable questions (human F1: 89.5 — RoBERTa essentially matches humans). MNLI: natural language inference across 10 genres (human: 92.0%, random: 33.3%). SST-2: binary sentiment classification (human: ~97%, random: 50%). RACE: multiple-choice reading comprehension from English exams (human: 92.2%, random: 25%). The RACE gain (+11.2 pts) is especially striking because it requires multi-sentence reasoning — exactly the capability that better training unlocks.

  • Dynamic masking consistently outperforms static masking
  • More data + longer training = substantial gains
  • Removing NSP improves downstream task performance
  • Some tasks see enormous gains (RACE: +11.2 points!)

ALBERT: a lite BERT

ALBERT (Lan et al., 2019) dramatically reduces BERT's parameter count through three innovations:

1. Factorized embedding parameters:

  • BERT: vocabulary (30K) × hidden size (768) = 23M parameters
  • ALBERT: vocabulary (30K) × embedding size (128) + embedding (128) × hidden (768) = 3.9M parameters
  • 83% fewer embedding parameters

2. Cross-layer parameter sharing:

  • All 12 transformer layers share the same weights
  • Like running the same layer 12 times (iterative refinement)
  • 89% fewer transformer parameters

3. Sentence Order Prediction (SOP):

  • Replaces NSP with a harder task: are sentences A, B in the correct order?
  • Forces the model to learn discourse coherence, not just topic matching

Factorized embedding parameters

BERT embeds each token directly into the hidden dimension $C = 768$. ALBERT introduces a bottleneck $E = 128 \ll C$:

$$\underbrace{V \times C}_{\text{BERT: 30K} \times \text{768}} \quad\longrightarrow\quad \underbrace{V \times E}_{\text{30K} \times \text{128}} \;\times\; \underbrace{E \times C}_{\text{128} \times \text{768}}$$

This factorization applies to the token embeddings, which dominate the parameter count. Positional ($512 \times E$) and segment ($2 \times E$) embeddings also use $E$, but they're tiny. All three are summed in $E$-space, then projected to $C$ with a single linear layer.

The vocabulary matrix is low-rank — tokens cluster into a much smaller subspace than $C = 768$. Embeddings only need to encode token identity; the projection layer handles the mapping to hidden space. Result: 83% fewer embedding parameters (23M → 3.9M).

ALBERT parameter efficiency

Model Layers Hidden size Parameters
BERT-base 12 768 110M
ALBERT-base 12 768 12M
ALBERT-large 24 1024 18M
ALBERT-xlarge 24 2048 60M
ALBERT-xxlarge 12 4096 235M

Factorized embedding in Python


1import torch.nn as nn
2
3# BERT: direct embedding (30K vocab × 768 hidden = 23M params)
4bert_embed = nn.Embedding(30000, 768)
5
6# ALBERT: two-step embedding (30K × 128 + 128 × 768 = 3.9M params)
7albert_embed = nn.Embedding(30000, 128)
8albert_project = nn.Linear(128, 768)
9# 83% fewer embedding parameters!

Cross-layer parameter sharing: BERT vs ALBERT


1class BERT:
2    def __init__(self):
3        # Each layer has unique parameters
4        self.layers = [TransformerLayer() for _ in range(12)]
5        # 12 × 7M params = 85M params in transformer layers
6
7    def forward(self, x):
8        for layer in self.layers:
9            x = layer(x)   # Different weights each time
10        return x

Cross-layer parameter sharing: BERT vs ALBERT


1class ALBERT:
2    def __init__(self):
3        # Single shared layer
4        self.shared_layer = TransformerLayer()
5        # 1 × 7M params = 7M params (89% reduction!)
6
7    def forward(self, x):
8        for _ in range(12):
9            x = self.shared_layer(x)  # Same weights reused
10        return x

ALBERT has 89% fewer parameters but the same compute cost — it still performs 12 forward passes through the transformer layer. The savings are in memory, not speed.

DistilBERT: knowledge distillation

Knowledge distillation (Sanh et al., 2019) compresses BERT into a smaller, faster model:

  • Teacher: Full BERT-base (12 layers, frozen)
  • Student: DistilBERT (6 layers, trainable)
  • Training signal: Student learns to match the teacher's soft probability distributions, not just the hard labels

The teacher's "wrong" predictions contain useful information — e.g., predicting "cat" is more likely than "car" for a masked animal slot tells the student about semantic similarity.

Metric BERT-base DistilBERT Change
Parameters 110M 66M 40% smaller
Inference speed 1× 1.6× 60% faster
GLUE score 79.6 77.0 97% retained

GLUE (General Language Understanding Evaluation) is a benchmark suite of 9 tasks — including MNLI, SST-2, and others — that tests grammar, sentiment, similarity, and inference. The score is an average across all tasks; human baseline is ~87.

DistilBERT training in Python


1teacher = BertModel.from_pretrained("bert-base")  # 12 layers, frozen
2student = DistilBertModel(num_layers=6)            # 6 layers, trainable
3
4for batch in training_data:
5    # Teacher provides "soft targets" (probability distributions)
6    with torch.no_grad():
7        teacher_logits = teacher(batch)  # e.g., [0.7, 0.2, 0.1, ...]
8
9    # Student tries to match teacher's distribution
10    student_logits = student(batch)
11
12    # Distillation loss: KL divergence between soft distributions
13    # Temperature T=2 softens the distribution (more informative)
14    loss_distill = KL_divergence(
15        softmax(student_logits / T),
16        softmax(teacher_logits / T)
17    )
18    loss_mlm = masked_lm_loss(student_logits, labels)
19
20    loss = 0.5 * loss_distill + 0.5 * loss_mlm

ELECTRA: efficient learning from all tokens

ELECTRA (Clark et al., 2020) uses a generator-discriminator setup:

  1. A small generator (like a mini-BERT) fills in masked tokens with plausible replacements
  2. A discriminator classifies every token as original or replaced
  3. The discriminator provides a training signal for all tokens — not just the 15% that were masked

BERT's MLM replaces tokens with [MASK] — an obvious tell that never appears in real text. The discriminator's task is harder: the generator produces plausible substitutions ("ate" for "cooked"), so the discriminator must understand the full context deeply enough to detect subtle semantic mismatches. This is closer to how humans process language — we don't spot blank slots, we notice when something doesn't quite fit.

ELECTRA in Python


1import torch
2from transformers import AutoTokenizer, AutoModelForMaskedLM, ElectraForPreTraining
3
4tokenizer = AutoTokenizer.from_pretrained("google/electra-small-generator")
5generator = AutoModelForMaskedLM.from_pretrained("google/electra-small-generator")
6discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
7
8# Step 1: Mask a token and let the generator fill it in
9original = "The chef cooked a delicious meal"
10masked = "The chef [MASK] a delicious meal"
11inputs = tokenizer(masked, return_tensors="pt")
12
13with torch.no_grad():
14    gen_logits = generator(**inputs).logits
15
16mask_idx = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
17predicted_id = gen_logits[0, mask_idx].argmax(dim=-1)
18replacement = tokenizer.decode(predicted_id)
19print(f"Generator filled [MASK] → '{replacement}'")  # e.g., "prepared"
20
21# Step 2: Discriminator classifies EVERY token as original or replaced
22fake_ids = inputs.input_ids.clone()
23fake_ids[0, mask_idx] = predicted_id
24with torch.no_grad():
25    disc_logits = discriminator(fake_ids).logits
26
27predictions = (disc_logits.squeeze() > 0).long()  # positive logit = "fake"
28tokens = tokenizer.convert_ids_to_tokens(fake_ids[0])
29for tok, pred in zip(tokens, predictions):
30    print(f"  {tok:12s}{'REPLACED' if pred else 'original'}")

ELECTRA efficiency

ELECTRA learns from 100% of tokens (every position gets a real/replaced label), compared to BERT's 15%. This 6.7× increase in training signal means ELECTRA reaches BERT-level performance with 4× less compute.

ELECTRA is ideal when you have limited compute budget:

  • ELECTRA-Small outperforms BERT-Small
  • ELECTRA-Base is competitive with BERT-Large
  • With the same compute budget, ELECTRA consistently wins

ModernBERT: 6 years of decoder tricks, applied to encoders

ModernBERT (Warner et al., 2024) asks: what if we rebuilt BERT from scratch using everything we've learned from training decoders?

Component BERT (2018) ModernBERT (2024)
Position encoding Learned absolute (512 max) RoPE (8,192 tokens)
Attention Full quadratic Flash Attention + alternating global/local
Padding Processes pad tokens Unpadding (only real tokens)
Training data 3.3B words 2 trillion tokens (600×)
Code understanding None Trained on code corpora

SOTA on GLUE, retrieval (MTEB), and code understanding. Available as answerdotai/ModernBERT-base and answerdotai/ModernBERT-large. Proves that the encoder architecture still has room to grow.

ModernBERT key innovations

Instead of learning a fixed position embedding for each slot (BERT's approach, capped at 512 tokens), RoPE encodes position by rotating the query and key vectors in attention. Relative distances are captured by the angle between rotated vectors. This generalizes to sequences longer than training length — ModernBERT handles 8,192 tokens vs BERT's 512.

Standard attention materializes the full $T \times T$ attention matrix in GPU memory ($O(T^2)$ space). Flash Attention computes attention tile-by-tile in fast on-chip SRAM, never storing the full matrix. Same exact result, but ~2–4× faster and uses $O(T)$ memory. ModernBERT alternates between global attention (every token sees every token) and local attention (sliding window) across layers.

Batched inputs require padding shorter sequences to the same length. BERT wastes compute processing these pad tokens through every layer. Unpadding strips pad tokens before the transformer and reinserts them after, so the model only processes real tokens. In batches with variable-length inputs, this can save 20–30% of total compute.

Variant comparison summary

Model Key innovation Best for
BERT MLM + NSP Baseline, well-understood
RoBERTa Better training recipe Maximum quality (classic)
ALBERT Parameter sharing Memory-constrained deployment
DistilBERT Knowledge distillation Speed-critical production
ELECTRA Replaced token detection Limited training budget
ModernBERT Modern training + RoPE + Flash Attention Maximum quality (2024)
  • Best quality (2024): ModernBERT-Large
  • Best efficiency: DistilBERT
  • Limited memory: ALBERT
  • Limited training budget: ELECTRA
  • Good default: RoBERTa-Base or ModernBERT-Base

Other notable BERT variants

DeBERTa (Microsoft, 2020): Disentangled attention separates content and position representations. Enhanced mask decoder. State-of-the-art on SuperGLUE.

SpanBERT (Facebook, 2019): Masks random contiguous spans instead of individual tokens. Span boundary objective. Better for extractive tasks (QA, coreference).

ERNIE (Baidu, 2019): Entity-level and phrase-level masking. Knowledge-enhanced pre-training. Strong on Chinese NLP tasks.

BART (Facebook, 2019): Encoder-decoder architecture (not encoder-only). Denoising autoencoder with various corruption strategies. Excellent for generation tasks.

The case against encoders

2020 — In-context learning: GPT-3 (Brown et al.) showed that a single decoder model can perform classification, NLI, and QA via prompting — tasks that previously required fine-tuning separate BERT models for each.

2023 — Decoders match fine-tuned encoders: GPT-4 (OpenAI) matched or exceeded fine-tuned BERT/RoBERTa on many NLU benchmarks without any task-specific training.

2024 — Decoders do retrieval too: GritLM (Muennighoff et al.) demonstrated a single decoder model that handles both generation and embedding at SOTA levels — the last domain where encoders had a clear advantage.

Why fine-tune six BERT models for six tasks when one decoder does them all via prompting?

Gemma Encoder and the encoder renaissance

Gemma (Google, 2024) is a family of open-weight decoder-only language models (2B and 7B parameters) built from the same research behind Gemini. Outperforms similarly sized open models on 11/18 text benchmarks.

Gemma Encoder (2025) repurposes Gemma's decoder weights for bidirectional encoding — Google's first encoder-only model since BERT. Competitive with ModernBERT on sentence embedding tasks.

If Google — a company betting heavily on decoder-only models (Gemini) — still releases an encoder model, it signals that encoders serve a purpose decoders can't efficiently fill. The encoder isn't dead; it's being modernized.

Questions?

📧 Email
💬 Discord

Encoder models in the real world — applications, brain-model convergence, and what it all means for language and society

split: 25

split: 30