* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 21: Diffusion models for text

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Explain how diffusion models generate content by iteratively refining noise into signal
  2. Describe the forward and reverse processes in continuous diffusion (foundation for Lectures 22–23)
  3. Explain how discrete diffusion adapts the framework for text by replacing Gaussian noise with masking
  4. Compare autoregressive and diffusion-based text generation and articulate their tradeoffs
  5. Build intuition for why iterative refinement is a powerful paradigm for language generation

Announcements

Assignment 4 (Customer Service Chatbot) is due today at 11:59 PM EST.

  • Final Project has been released! Due March 9, 11:59 PM EST
  • Assignment 5 (Build GPT) is now available as optional extra credit

Final project

Work in teams of 2-3(ish) to build an ambitious, open-ended project over Weeks 7-10. Explore novel applications, replicate and extend published research, build innovative systems, or evaluate LLM capabilities.

  • Weeks 7–8: Form teams and brainstorm ideas
  • Week 9: Core development and experiments
  • Week 10: Final submission + in-class presentations (March 9, 11:59 PM EST)

Why diffusion models?

Every language model we've studied so far — from ELIZA to GPT — generates text one token at a time, left to right. Diffusion models take a fundamentally different approach: start with pure noise and iteratively refine it into a coherent output, generating all positions simultaneously.

Imagine writing an essay by first typing random letters into every position, then making many editing passes — each pass fixing more of the text until a polished essay emerges. That's diffusion. It sounds absurd, but it works remarkably well — and for some tasks, it's better than left-to-right generation.

Two paradigms for generation

Autoregressive (GPT) Diffusion
Generation order Left to right, one token at a time All positions at once, refined iteratively
Analogy Speaking a sentence word by word Editing a rough draft into a polished one
Key strength Simple, proven at scale Bidirectional context, flexible editing
Key weakness Can't "go back" — early errors propagate Requires many refinement steps

When you write, you don't commit to each word in sequence. You draft, revise, restructure. Diffusion models formalize this iterative refinement process — and recent results show they can match autoregressive models at the scale of billions of parameters (LLaDA; Nie et al., 2025).

The diffusion framework: big picture

Every diffusion model has two processes:

  1. Forward process (corruption): Gradually destroy information in the data until nothing remains but noise
  2. Reverse process (generation): Learn to undo the corruption, step by step, recovering the original data from noise

The key insight from Sohl-Dickstein et al. (2015): if each corruption step is small enough, then each reversal step can be learned by a neural network.

Forward:  Clean data → Slightly noisy → Noisier → … → Pure noise
Reverse:  Pure noise → Slightly cleaner → Cleaner → … → Generated output

Diffusion was born in image space

Diffusion models were originally developed for image generation (Sohl-Dickstein et al., 2015; Ho et al., 2020). The idea is intuitive with images: start with a clean picture and gradually corrupt it by adding random noise until the image is unrecognizable. Then train a neural network to reverse the process.

Forward diffusion process on a simple image

This visual intuition made diffusion wildly successful for image generation (DALL-E, Stable Diffusion, Midjourney — covered in Lectures 22–23). But how do we extend this to text, where "adding a little noise to a word" is meaningless? We'll answer this after establishing the mathematical framework.

Continuous diffusion: the foundation

For continuous data (images, audio, embeddings), the forward process adds Gaussian noise at each timestep $t = 1, \ldots, T$:

$$\mathbf{x}_t \sim \mathcal{N}\!\left(\sqrt{1 - \beta_t}\,\mathbf{x}_{t-1},\; \beta_t \mathbf{I}\right)$$

where $\beta_t$ is a small noise amount at step $t$. After $T \approx 1000$ steps, the original signal is completely destroyed — $\mathbf{x}_T$ is indistinguishable from pure Gaussian noise.

We can jump directly to any timestep. Define $\bar\alpha_t = \prod_{s=1}^{t}(1 - \beta_s)$:

$$\mathbf{x}_t = \sqrt{\bar\alpha_t}\,\mathbf{x}_0 + \sqrt{1 - \bar\alpha_t}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$$

This is essential for efficient training — there is no need to iterate through all $t$ steps.

Continuous diffusion: learning to reverse

A neural network $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t)$ is trained to predict the noise that was added at each step. The loss is simply mean squared error (Ho et al., 2020):

$$\mathcal{L} = \mathbb{E}_{t,\,\mathbf{x}_0,\,\boldsymbol{\epsilon}}\!\left[\left\|\boldsymbol{\epsilon} - \boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t)\right\|^2\right]$$

To generate, start from pure noise $\mathbf{x}_T \sim \mathcal{N}(\mathbf{0}, \mathbf{I})$ and iteratively subtract the predicted noise, step by step, until clean data $\mathbf{x}_0$ emerges.

Lectures 22 and 23 build on this continuous framework (latent diffusion, classifier-free guidance, DiT, Stable Diffusion, Sora). Understanding Gaussian diffusion provides the conceptual foundation — but text is not continuous. Tokens are discrete symbols. So how do we adapt diffusion for language?

The problem with text

Adding Gaussian noise to a sentence doesn't work:

  • Pixel values are continuous numbers (0–255) — small perturbations are meaningful
  • Token IDs are discrete symbols — "adding 0.1 to the token 'cat'" is meaningless
  • There's no natural metric space where "cat" is close to "car" but far from "democracy"
Two main workarounds have been proposed:

  1. Continuous embedding approach (Diffusion-LM; Li et al., 2022): Map tokens to continuous embeddings, run Gaussian diffusion there, then round back to tokens
  2. Discrete diffusion (D3PM; Austin et al., 2021): Replace Gaussian noise with discrete corruption — randomly changing tokens to other tokens or to [MASK]

The discrete approach turns out to be simpler, more effective, and connects beautifully to something you already know.

Discrete diffusion: replacing noise with masking

Instead of adding Gaussian noise, the forward process randomly masks tokens:

  • At each step, each unmasked token has a small probability of being replaced with [MASK]
  • As $t$ increases, more tokens become masked
  • At $t = T$: every token is masked — the sequence is pure noise (all [MASK])
  • At $t = 0$: the original clean text

The cat sat on the mat → The _ sat on _ mat → _ _ sat _ _ mat → _ _ _ _ _ _

Once a token becomes [MASK], it stays masked. The [MASK] state absorbs tokens — they can't escape. This is the simplest discrete corruption: there's only one type of noise, and...you've seen it before!

Does this remind you of anything?

Recall from Lecture 18: BERT trains by masking 15% of tokens and predicting the originals. Discrete diffusion does the same thing — but with a key generalization:

BERT (Lecture 18) Discrete diffusion
Mask rate Select 15%, then 80/10/10 Varies from 0% to 100% over a schedule
Prediction One-shot: predict all masks at once Iterative: unmask a few tokens at a time
Training Single forward pass per example Sample random mask rate tt, predict masked tokens
Generation Not designed for generation Built for generation: start at 100% masked, iteratively unmask

Discrete diffusion generalizes masked language modeling into a generation framework. BERT is a special case — a single-step diffusion model at a fixed noise level.

MDLM: masked diffusion language models

MDLM formalizes the connection between masked language modeling and diffusion. The key ingredients:

  1. Forward process: A continuous-time masking schedule $\gamma(t)$ that specifies the probability a token is masked at time $t \in [0, 1]$. At $t = 0$, nothing is masked; at $t = 1$, everything is.
  2. Reverse process: A transformer predicts the identity of each masked token, conditioned on all visible tokens and the current noise level $t$.
  3. Training: Sample a random time $t$, mask tokens according to $\gamma(t)$, and train the model to predict the masked tokens — exactly like BERT, but at every mask rate.

Rao-Blackwellization is a statistical technique: when estimating something, if you can compute part of the answer exactly instead of approximating it, your overall estimate becomes more precise. In diffusion training, this means replacing some approximation steps with exact calculations — leading to less noisy gradients and faster learning.

MDLM uses a Rao-Blackwellized training objective that reduces variance compared to naïve discrete diffusion. In practice, this means the model trains efficiently with the same architecture as BERT — no special modifications needed. On text benchmarks, MDLM matches autoregressive models of the same size.

How text diffusion generates

To generate text with MDLM, we reverse the masking:

  1. Start with a sequence of all [MASK] tokens (length $N$)
  2. At each step, the model predicts a probability distribution over the vocabulary for each masked position
  3. Unmask a subset of positions by sampling from these distributions
  4. Repeat until all positions are unmasked
_ _ _ _ _ _ → _ cat _ _ _ _ → _ cat _ on _ mat → The cat sat on the mat

Unlike GPT, which can only see tokens to the left, the diffusion model sees all unmasked tokens when predicting each mask — regardless of position. This means the model can use "mat" to help predict "cat" and vice versa. The generation order is not fixed — the model decides what to fill in first.

What does the model learn to do first?

When generating text via iterative unmasking, models don't unmask uniformly. They develop preferences:

  • High-confidence tokens first: Function words ("the", "is", "of") and predictable tokens tend to be unmasked early
  • Content words later: Nouns, verbs, and less predictable tokens come after the "scaffolding" is in place
  • Context-dependent tokens last: Words that depend heavily on surrounding context wait until that context is available

This mirrors how humans plan sentences — we often have the gist (content words) and structure (function words) before the exact phrasing. Diffusion models discover a similar strategy: build the scaffolding first, then fill in the details. This is fundamentally different from GPT's strict left-to-right constraint.

D3PM: the general framework

D3PM (Discrete Denoising Diffusion Probabilistic Models) provides the theoretical foundation for all discrete diffusion. Instead of Gaussian noise, D3PM uses transition matrices $\mathbf{Q}_t$ that define how tokens corrupt at each step:

$$q(\mathbf{x}_t \mid \mathbf{x}_{t-1}) = \text{Cat}(\mathbf{x}_t;\; \mathbf{x}_{t-1} \mathbf{Q}_t)$$

where $\text{Cat}$ is the categorical distribution and $\mathbf{Q}_t$ specifies the probability of each token transitioning to any other token.

Noise type How it works Intuition
Uniform Any token → any random token Like replacing letters with random ones
Absorbing Any token → [MASK] only Like erasing letters one by one
Token similarity Token → similar token Like introducing typos

MDLM uses absorbing noise because it's the simplest and most effective for text.

Scaling up: LLaDA

LLaDA asks: can discrete diffusion compete with autoregressive models at scale? They trained an 8 billion parameter masked diffusion model on 2.3 trillion tokens and found:

  • On MMLU, LLaDA-8B (66.6) matches LLaMA 3-8B (66.2)
  • On GSM8K (math reasoning), LLaDA-8B (74.0) substantially outperforms LLaMA 3-8B (52.8)
  • LLaDA handles instruction following, reasoning, and long-form generation
  • It naturally solves the reversal curse — since it doesn't generate left-to-right, it can answer "Who is A's mother?" even when trained on "B is the mother of A"

For years, the assumption was that autoregressive generation was the only way to build competitive language models. LLaDA disproves this. Diffusion-based LLMs are a viable alternative paradigm — not just a research curiosity, but a practical one at the 8B parameter scale.

The Diffusion-LM approach: embeddings as a bridge

Before discrete diffusion matured, Diffusion-LM tried a different approach: run Gaussian diffusion in continuous embedding space:

  1. Map each token to its embedding vector (e.g., 768-dim)
  2. Run standard Gaussian diffusion on these embeddings
  3. At generation time, round each denoised embedding back to the nearest vocabulary token
Diffusion-LM (continuous) MDLM / D3PM (discrete)
Noise type Gaussian in embedding space Masking or token swaps
Rounding step Required (introduces errors) Not needed
Controllability Excellent — gradient-based guidance Good — conditional masking
Scalability Limited by rounding artifacts Scales to 8B+ parameters

Discrete methods have won out for pure text generation, but continuous approaches remain valuable for controllable generation — e.g., guiding text toward specific sentiment, topics, or structural constraints.

Intuition: why does iterative refinement work?

1. The editing analogy: A first draft with blanks is easier to improve than starting from scratch. Each unmasking step gives the model more context, making the remaining predictions easier.

2. The jigsaw puzzle analogy: In a jigsaw puzzle, each piece you place constrains where other pieces go. Similarly, each unmasked token constrains the remaining tokens — the problem gets easier as you go.

3. The ensemble effect: At each step, the model makes predictions using bidirectional context from all visible tokens. Early steps leverage global structure; later steps leverage local details. This multi-scale reasoning is hard for autoregressive models.

Diffusion works because it factorizes a hard problem (generating a full sequence from nothing) into a sequence of easier problems (predicting a few tokens given most of the context). Each step is essentially a fill-in-the-blank task — something neural networks are very good at.

Practical advantages of text diffusion

| Capability                                              | Autoregressive                            | Text diffusion                      |
|---------------------------------------------------------|-------------------------------------------|-------------------------------------|
| Infilling: fill in a gap mid-sentence                   | Requires special fine-tuning              | Native — just mask the gap          |
| Iterative editing: refine parts of generated text       | Must regenerate from the edit point       | Re-mask and re-denoise locally      |
| Length control: generate exactly N tokens               | Hard — models tend to over/under-generate | Natural — initialize N masks        |
| Parallel decoding: generate multiple tokens at once     | Sequential by definition                  | Unmask multiple positions per step  |
| Bidirectional coherence: ensure beginning matches end   | Can't look ahead                          | Sees all unmasked positions         |

Text diffusion typically requires 10–100 refinement steps, each involving a full forward pass. Autoregressive generation requires N steps (one per token) but each step is faster. For short sequences, autoregressive wins on speed. For tasks requiring bidirectional coherence or editing, diffusion wins on quality.

Current limitations

  1. Long-range coherence: Autoregressive models maintain a running context that naturally ensures consistency. Diffusion models must learn long-range dependencies through the iterative process, which can fail for very long documents.

  2. Sampling speed: Even with optimized schedules, generating text with diffusion is slower than autoregressive generation with KV-caching for most practical sequence lengths.

  3. Ecosystem maturity: Autoregressive models have years of tooling — RLHF, DPO, KV-caching, speculative decoding. Diffusion-based LLMs are catching up but lack this infrastructure.

  4. Evaluation: Perplexity (the standard LM metric) doesn't directly apply to diffusion models, making fair comparison difficult.

  • RLHF (Lecture 16): Reinforcement Learning from Human Feedback — humans rank model outputs, and the model is trained to prefer higher-ranked responses
  • DPO (Direct Preference Optimization): A simpler alternative to RLHF that skips the reward model and directly optimizes the language model on human preference pairs
  • KV-caching: During autoregressive generation, previously computed key/value vectors are stored and reused so each new token only requires one forward pass through the new position — not the entire sequence

Take-home messages

  • BERT's masked prediction training (Lecture 18) — which seemed like just a pretraining trick — turns out to be the foundation of an entire generation paradigm.
  • The idea of iterative refinement from noise is a powerful principle that extends beyond text to images, audio, and video.
  • Because the same diffusion framework applies to both continuous data (images, audio, video) and discrete data (text), it provides a natural foundation for multimodal models that generate across modalities — as we'll see in Lectures 22–23.

Further reading

Sohl-Dickstein et al. (2015, ICML) "Deep Unsupervised Learning using Nonequilibrium Thermodynamics" — The original diffusion model paper, grounded in statistical physics.

Ho, Jain & Abbeel (2020, NeurIPS) "Denoising Diffusion Probabilistic Models" — Made diffusion practical with the simplified noise-prediction objective.

Austin et al. (2021, NeurIPS) "Structured Denoising Diffusion Models in Discrete State-Spaces" — D3PM: the theoretical foundation for all discrete diffusion.

Li et al. (2022, NeurIPS) "Diffusion-LM Improves Controllable Text Generation" — Continuous embedding approach to text diffusion with gradient-based control.

Lou et al. (2024, ICML) "Discrete Diffusion Modeling by Estimating the Ratios of the Data Distribution" — SEDD: score matching for discrete spaces (Best Paper).

Sahoo et al. (2024, NeurIPS) "Simple and Effective Masked Diffusion Language Models" — MDLM: connecting BERT-style masking to diffusion.

Nie et al. (2025, arXiv) "Large Language Diffusion Models" — LLaDA: 8B-parameter diffusion LLM competitive with LLaMA 3.

Questions?

📧 Email
💬 Discord

Image diffusion and multimodal generation: latent diffusion, CLIP, and the path from DDPM to Stable Diffusion 3