
Lecture 19: BERT variants

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Identify key limitations of the original BERT training procedure
  2. Explain how RoBERTa, ALBERT, DistilBERT, ELECTRA, and DeBERTa each address different limitations
  3. Compare parameter efficiency, training efficiency, and inference speed across variants
  4. Select the appropriate variant for a given task and resource constraint
  5. Argue whether encoder models remain relevant in the era of GPT-4

What could be improved?

Training procedure: NSP may hurt performance; static masking reuses the same masks every epoch; only 15% of tokens provide training signal.

Scale: Trained on only 3.3B words for roughly 1M steps; modern pretraining datasets are 100× larger or more.

Efficiency: 110M parameters are all active for every input. Large memory footprint for deployment.

📓 Companion Notebook — try different BERT variants hands-on

RoBERTa: robustly optimized BERT

RoBERTa (Liu et al., 2019) keeps BERT's architecture but fixes the training recipe:

  1. Remove NSP — Next Sentence Prediction hurt performance. Use only MLM with full sentences.
  2. Dynamic masking — Generate a new masking pattern every time a sequence is seen, instead of reusing the same mask.
  3. Larger batches, more data — Batch size of 8K sequences (vs BERT's 256), 160GB of text (vs 16GB), and 500K steps at that much larger batch size, far exceeding BERT's original 1M small-batch steps.
  4. Longer sequences — Train on longer contiguous text for better long-range understanding.

Training procedure matters as much as architecture. RoBERTa shows that BERT was significantly undertrained.

Dynamic masking

BERT used static masking — the same mask positions every epoch, risking memorization. RoBERTa generates new masks on-the-fly during training:

  • Epoch 1: "My [MASK] is cute"
  • Epoch 2: "My dog [MASK] cute"
  • Epoch 3: "My dog is [MASK]"

More diverse training signal → better generalization. Combined with removing NSP and training longer on more data, these changes account for most of RoBERTa's gains.
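
A rough sketch of the idea (a hypothetical helper, not RoBERTa's actual implementation): every pass over a sequence samples a fresh ~15% of positions to mask.

import random

def dynamic_mask(tokens, mask_prob=0.15, mask_token="[MASK]"):
    # Sample a new set of masked positions on every call (i.e., every epoch/pass)
    masked = [i for i in range(len(tokens)) if random.random() < mask_prob]
    return [mask_token if i in masked else tok for i, tok in enumerate(tokens)]

tokens = "My dog is cute".split()
print(dynamic_mask(tokens))  # different positions get masked each time this runs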

RoBERTa results

Task BERT-Large RoBERTa Improvement
SQuAD 2.0 83.1 89.4 +6.3
MNLI 86.7 90.2 +3.5
SST-2 94.9 96.4 +1.5
RACE 72.0 83.2 +11.2
  • Dynamic masking consistently outperforms static masking
  • More data + longer training = substantial gains
  • Removing NSP improves downstream task performance
  • Some tasks see enormous gains (RACE: +11.2 points!)

ALBERT: a lite BERT

ALBERT (Lan et al., 2019) dramatically reduces BERT's parameter count through two parameter-reduction techniques, and replaces NSP with a new pretraining objective:

1. Factorized embedding parameters:

  • BERT: vocabulary (30K) × hidden size (768) = 23M parameters
  • ALBERT: vocabulary (30K) × embedding size (128) + embedding (128) × hidden (768) = 3.9M parameters
  • 83% fewer embedding parameters

2. Cross-layer parameter sharing:

  • All 12 transformer layers share the same weights
  • Like running the same layer 12 times (iterative refinement)
  • 89% fewer transformer parameters

3. Sentence Order Prediction (SOP):

  • Replaces NSP with a harder task: are sentences A, B in correct order or swapped?
  • Forces the model to learn discourse coherence, not just topic matching
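
A minimal sketch of how SOP training pairs can be built from consecutive sentences (illustrative only; this is not ALBERT's actual data pipeline):

import random

def make_sop_example(sent_a, sent_b):
    # sent_a and sent_b are consecutive sentences from the same document
    if random.random() < 0.5:
        return (sent_a, sent_b), 1   # positive: original order
    return (sent_b, sent_a), 0       # negative: same sentences, swapped order

pair, label = make_sop_example("The chef cooked a meal.", "Everyone enjoyed it.")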

ALBERT parameter efficiency

Model Layers Hidden size Parameters
BERT-base 12 768 110M
ALBERT-base 12 768 12M
ALBERT-large 24 1024 18M
ALBERT-xlarge 24 2048 60M
ALBERT-xxlarge 12 4096 235M

import torch.nn as nn

# BERT: direct embedding (30K vocab × 768 hidden = 23M params)
bert_embed = nn.Embedding(30000, 768)

# ALBERT: two-step embedding (30K × 128 + 128 × 768 = 3.9M params)
albert_embed = nn.Embedding(30000, 128)
albert_project = nn.Linear(128, 768)

# 83% fewer embedding parameters!
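
A quick check (using the variables from the sketch above) confirms the arithmetic:

bert_params = sum(p.numel() for p in bert_embed.parameters())            # 23,040,000
albert_params = (sum(p.numel() for p in albert_embed.parameters())
                 + sum(p.numel() for p in albert_project.parameters()))  # ≈ 3,939,000
print(f"embedding parameter reduction: {1 - albert_params / bert_params:.0%}")  # ≈ 83%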

Cross-layer parameter sharing


class BERT:
    def __init__(self):
        # Each layer has unique parameters
        self.layers = [TransformerLayer() for _ in range(12)]
        # 12 × 7M params = 85M params in transformer layers

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)   # Different weights each time
        return x

class ALBERT:
    def __init__(self):
        # Single shared layer
        self.shared_layer = TransformerLayer()
        # 1 × 7M params = 7M params (89% reduction!)

    def forward(self, x):
        for _ in range(12):
            x = self.shared_layer(x)  # Same weights reused
        return x

ALBERT has 89% fewer parameters but the same compute cost — it still performs 12 forward passes through the transformer layer. The savings are in memory, not speed.

DistilBERT: knowledge distillation

Knowledge distillation (Sanh et al., 2019) compresses BERT into a smaller, faster model:

  • Teacher: Full BERT-base (12 layers, frozen)
  • Student: DistilBERT (6 layers, trainable)
  • Training signal: Student learns to match the teacher's soft probability distributions, not just the hard labels

The teacher's "wrong" predictions contain useful information — e.g., predicting "cat" is more likely than "car" for a masked animal slot tells the student about semantic similarity.
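
A small numerical sketch (made-up logits) shows why temperature matters: dividing the teacher's logits by T exposes its relative preferences instead of a near-hard label.

import torch
import torch.nn.functional as F

logits = torch.tensor([6.0, 3.5, 0.5])     # teacher scores for, say, "cat", "dog", "car"
print(F.softmax(logits, dim=-1))           # ≈ [0.92, 0.08, 0.00]: almost a hard label
print(F.softmax(logits / 2.0, dim=-1))     # ≈ [0.74, 0.21, 0.05]: "dog" >> "car" is now visible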

Metric BERT-base DistilBERT Change
Parameters 110M 66M 40% smaller
Inference speed 1× 1.6× 60% faster
GLUE score 79.6 77.0 97% retained

DistilBERT training


teacher = BertModel.from_pretrained("bert-base")  # 12 layers, frozen
student = DistilBertModel(num_layers=6)           # 6 layers, trainable

T = 2  # distillation temperature: softens the distributions (more informative)

for batch in training_data:
    # Teacher provides "soft targets" (probability distributions)
    with torch.no_grad():
        teacher_logits = teacher(batch)  # e.g., [0.7, 0.2, 0.1, ...]

    # Student tries to match teacher's distribution
    student_logits = student(batch)

    # Distillation loss: KL divergence between temperature-softened distributions
    loss_distill = KL_divergence(
        softmax(student_logits / T),
        softmax(teacher_logits / T)
    )
    loss_mlm = masked_lm_loss(student_logits, labels)

    loss = 0.5 * loss_distill + 0.5 * loss_mlm

ELECTRA: efficient learning from all tokens

ELECTRA (Clark et al., 2020) uses a generator-discriminator setup:

  1. A small generator (like a mini-BERT) fills in masked tokens with plausible replacements
  2. A discriminator classifies every token as original or replaced
  3. The discriminator provides a training signal for all tokens — not just the 15% that were masked

sentence  = "The chef cooked a delicious meal"
masked    = "The chef [MASK] a delicious meal"

# Small generator fills in the mask
generator_output = generator(masked)  # Predicts: "ate" (plausible but wrong)
corrupted = "The chef ate a delicious meal"

# Discriminator classifies EVERY token: original or replaced?
discriminator(corrupted)
# Output: [orig, orig, REPLACED, orig, orig, orig]
# Loss computed on ALL 6 tokens (not just 1 masked token!)

ELECTRA efficiency

ELECTRA learns from 100% of tokens (every position gets a real/replaced label), compared to BERT's 15%. This 6.7× increase in training signal means ELECTRA reaches BERT-level performance with 4× less compute.
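
For intuition, a minimal sketch (made-up numbers) of the discriminator's objective: binary cross-entropy over every position, not just the masked ones.

import torch
import torch.nn.functional as F

scores = torch.randn(6)                           # one "was this token replaced?" score per token
labels = torch.tensor([0., 0., 1., 0., 0., 0.])   # only "ate" was a replacement
loss = F.binary_cross_entropy_with_logits(scores, labels)  # gradient signal from all 6 positions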

ELECTRA is ideal when you have limited compute budget:

  • ELECTRA-Small outperforms BERT-Small
  • ELECTRA-Base is competitive with BERT-Large
  • With the same compute budget, ELECTRA consistently wins

Variant comparison summary

Model Key innovation Best for
BERT MLM + NSP Baseline, well-understood
RoBERTa Better training recipe Maximum quality
ALBERT Parameter sharing Memory-constrained deployment
DistilBERT Knowledge distillation Speed-critical production
ELECTRA Replaced token detection Limited training budget

General guidelines:

  • Best quality: RoBERTa-Large
  • Best efficiency: DistilBERT
  • Limited memory: ALBERT
  • Limited training budget: ELECTRA
  • Good default: RoBERTa-Base or BERT-Base

Other notable BERT variants

DeBERTa (Microsoft, 2020): Disentangled attention separates content and position representations. Enhanced mask decoder. State-of-the-art on SuperGLUE.

SpanBERT (Facebook, 2019): Masks random contiguous spans instead of individual tokens. Span boundary objective. Better for extractive tasks (QA, coreference).

ERNIE (Baidu, 2019): Entity-level and phrase-level masking. Knowledge-enhanced pre-training. Strong on Chinese NLP tasks.

BART (Facebook, 2019): Encoder-decoder architecture (not encoder-only). Denoising autoencoder with various corruption strategies. Excellent for generation tasks.

Model selection guide

Step 1: What are your constraints?

  • Need maximum quality? → RoBERTa-Large
  • Need fast inference? → DistilBERT
  • Need small memory footprint? → ALBERT
  • Limited training compute? → ELECTRA
  • Need text generation? → Consider BART or GPT instead

Step 2: Consider your domain

  • Biomedical text? → BioBERT, PubMedBERT
  • Scientific text? → SciBERT
  • Legal text? → LegalBERT
  • Multilingual? → mBERT, XLM-RoBERTa

Step 3: Start simple, iterate

  • Begin with bert-base-uncased as a baseline
  • Try RoBERTa-base for an easy quality boost
  • Optimize for speed/memory only if needed

Using different variants with HuggingFace


from transformers import AutoModel, AutoTokenizer

# All variants share the same API — just change the model name!

# BERT
model = AutoModel.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# RoBERTa
model = AutoModel.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# ALBERT
model = AutoModel.from_pretrained("albert-base-v2")
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")

# DistilBERT
model = AutoModel.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# ELECTRA
model = AutoModel.from_pretrained("google/electra-base-discriminator")
tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")

Benchmarking variants


import time
from transformers import pipeline

models = {
    "bert-base": "textattack/bert-base-uncased-SST-2",
    "distilbert": "distilbert-base-uncased-finetuned-sst-2-english",
    "albert": "textattack/albert-base-v2-SST-2",
}
test_texts = ["This movie was fantastic!", "I hated every minute."] * 100

for name, model_id in models.items():
    pipe = pipeline("sentiment-analysis", model=model_id)
    start = time.time()
    results = pipe(test_texts)
    elapsed = time.time() - start
    print(f"{name}: {elapsed:.2f}s ({200/elapsed:.0f} samples/sec)")

Model Accuracy Speed (samples/sec) Memory
bert-base 93.2% ~45 420MB
distilbert 91.3% ~85 250MB
albert 92.7% ~38 45MB

Architecture evolution

Model Year Key innovation Quality vs BERT
BERT 2018 MLM + NSP Baseline
RoBERTa 2019 Better training recipe +3–11 pts
ALBERT 2019 Parameter sharing 89% fewer params
DistilBERT 2019 Knowledge distillation 97% quality, 60% faster
ELECTRA 2020 Learn from all tokens 4× less compute
DeBERTa 2020 Disentangled attention SOTA on SuperGLUE
ModernBERT 2024 Modern training + RoPE + Flash Attention SOTA on GLUE, retrieval

ModernBERT deep dive

ModernBERT applies 6 years of decoder innovations to the encoder architecture:

Innovation BERT (2018) ModernBERT (2024)
Position encoding Learned absolute (512 max) RoPE (8,192 tokens)
Attention Full quadratic Flash Attention + alternating global/local
Padding Processes pad tokens Unpadding (only real tokens)
Training data 3.3B words 2 trillion tokens (600× more)
Code understanding None Trained on code corpora

SOTA on GLUE, retrieval (MTEB), and code understanding benchmarks. Available as answerdotai/ModernBERT-base and answerdotai/ModernBERT-large on HuggingFace. Proves that encoder architecture still has room to grow when given modern training techniques.
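
Because ModernBERT ships as a standard HuggingFace checkpoint, it drops into the same AutoModel pattern shown earlier (a sketch; assumes a recent transformers version that includes the ModernBERT architecture):

from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained("answerdotai/ModernBERT-base")
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

inputs = tokenizer("Encoders are not dead yet.", return_tensors="pt")
outputs = model(**inputs)   # contextual embeddings; supports inputs up to 8,192 tokens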

Gemma Encoder and the encoder renaissance

Gemma Encoder (2025): a recent encoder-only model from Google, built by repurposing Gemma decoder weights for bidirectional encoding. Competitive with ModernBERT on sentence embedding tasks.

If Google — a company betting heavily on decoder-only models (Gemini) — still releases an encoder model, it signals that encoders serve a purpose decoders can't efficiently fill. The encoder isn't dead; it's being modernized.

Production deployment patterns

Optimization Speedup Memory savings Quality loss
ONNX Runtime 2–5× Moderate None
TensorRT (NVIDIA) 5–10× Moderate None
INT8 quantization 2–4× 4× smaller <1%
Distillation (→ DistilBERT) 1.6× 40% smaller ~3%
Full pipeline (distill → quantize) 10–20× 8× smaller ~4%

DistilBERT + INT8 quantization handles classification at ~$0.001/M tokens and ~2ms per request. GPT-4 costs ~$30/M tokens at ~500ms per request. For high-volume single-task workloads, optimized encoders are 30,000× cheaper and 250× faster.
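
As one concrete instance of the INT8 row above, PyTorch's dynamic quantization can convert the Linear layers of a fine-tuned DistilBERT to INT8 in a few lines (a sketch; actual speedups depend on hardware and runtime):

import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
quantized = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8   # INT8 weights for all Linear layers
)
# `quantized` behaves like the original model for CPU inference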

Is the encoder dead?

The case for "yes, encoders are obsolete":

  • Decoder-only models (GPT-4, Claude) can do classification, NER, QA via prompting
  • One model for all tasks vs. fine-tuning separate models
  • In-context learning eliminates the need for task-specific architectures

The case for "no, encoders still matter":

  • ModernBERT (2024) achieves SOTA on retrieval and classification — faster and cheaper than any decoder
  • Gemma Encoder (2025) proves Google still sees value in the architecture
  • Encoders are 10–100× cheaper to run than decoder models for classification tasks
  • Most production search and retrieval systems still use encoders (Sentence-BERT, E5, NV-Embed)

The real answer: It depends on your constraints. Encoders win on cost and latency. Decoders win on flexibility.
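
As a concrete example of the retrieval workloads mentioned above, the sentence-transformers library wraps an encoder for embedding and similarity search (the model name here is just one common choice):

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")    # a small distilled encoder
docs = ["BERT variants overview", "How to cook pasta", "Encoder models for retrieval"]

query_emb = model.encode("which models power semantic search?")
doc_embs = model.encode(docs)

scores = doc_embs @ query_emb                      # dot-product similarity scores
print(docs[scores.argmax()])                       # most similar document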

Discussion

  1. Training vs architecture: RoBERTa shows that training matters enormously. How many of BERT's "limitations" were really just symptoms of an undertrained model?

  2. The distillation paradox: Why does a student model learn better from soft probability distributions than from hard labels? What "dark knowledge" is in the teacher's mistakes?

  3. Encoders in 2026: Google, Hugging Face, and others are still releasing encoder models. If decoders can do everything, why? What does this tell us about the efficiency-flexibility tradeoff?

  4. Parameter sharing (ALBERT): All 12 layers share the same weights and it still works. What does this imply about what transformer layers actually learn?

Further reading

Liu et al. (2019, arXiv) "RoBERTa: A Robustly Optimized BERT Pretraining Approach" — Better training recipe, same architecture.

Lan et al. (2019, ICLR) "ALBERT: A Lite BERT" — 89% parameter reduction via sharing.

Sanh et al. (2019, NeurIPS Workshop) "DistilBERT" — Knowledge distillation for 60% speedup.

Clark et al. (2020, ICLR) "ELECTRA" — Learn from all tokens, not just masked ones.

He et al. (2020, ICLR) "DeBERTa" — Disentangled attention, SOTA on SuperGLUE.

Warner et al. (2024, arXiv) "ModernBERT" — Modern encoder with RoPE + Flash Attention.

Google (2025, arXiv) "Gemma Encoder" — Encoder-only Gemma for sentence embeddings.

Questions?

📧 Email
💬 Discord

Applications of encoder models: industry, neuroscience, and the "understanding" debate