* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 22: Image diffusion and multimodal generation

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Explain how latent diffusion compresses computation via a VAE to enable high-resolution image generation
  2. Describe how CLIP creates a shared embedding space for text and images
  3. Explain how cross-attention and classifier-free guidance enable text-controlled image generation
  4. Compare the U-Net and Transformer (DiT) backbones for diffusion models
  5. Generate images from text prompts using the HuggingFace diffusers library

From text to images

In Lecture 21, we saw how diffusion models generate text by iteratively unmasking tokens (discrete diffusion). Today we turn to the domain where diffusion was born — images — and ask: how do we generate high-resolution images from text prompts?

Challenge Solution (this lecture)
Images are huge (786K+ values) Latent diffusion — compress first, diffuse in latent space
Text must control generation CLIP + cross-attention — bind words to spatial regions
Output must match the prompt Classifier-free guidance — amplify text alignment
Architecture must scale U-Net or DiT — backbones for the denoising network

The pixel problem

The DDPM framework (Lecture 21) can be applied to images by adding Gaussian noise to pixels and learning to reverse the process. But for a 512×512 RGB image, the denoising network must process $512 \times 512 \times 3 = 786{,}432$ values at every step — far too expensive for high-resolution generation.

The U-Net (Ronneberger et al., 2015) is an encoder-decoder neural network with skip connections, originally designed for image segmentation. It became the default backbone for diffusion models — we'll cover it in detail later this lecture.

Resolution Pixels Memory Time per image
64 × 64 12,288 ~2 GB ~30 seconds
256 × 256 196,608 ~8 GB ~5 minutes
512 × 512 786,432 ~32 GB ~20 minutes

The solution: don't run diffusion in pixel space. Run it in a compressed latent space.

Latent diffusion

Latent diffusion separates image generation into two stages:

  1. Compression: A pretrained variational autoencoder (VAE) encodes images into a compact latent space (typically 8× spatial compression)
  2. Generation: The diffusion process operates entirely in the latent space before decoding back to pixels

The VAE is described on the next slide.

Latent diffusion pipeline

Rombach et al. (2022, CVPR) "High-resolution image synthesis with latent diffusion models" — The paper that introduced latent diffusion and enabled Stable Diffusion.

Variational autoencoder (VAE)

A VAE learns to compress images into a low-dimensional latent representation and reconstruct them:

  1. Encoder: Maps image $\mathbf{x} \in \mathbb{R}^{512 \times 512 \times 3}$ to mean $\boldsymbol{\mu}$ and standard deviation $\boldsymbol{\sigma}$ of a latent distribution
  • $\boldsymbol{\mu}$ represents the "average" latent representation for the image
  • $\boldsymbol{\sigma}$ captures uncertainty — how much the latent can vary while still reconstructing the image
  2. Sampling: Draw $\mathbf{z} \sim \mathcal{N}(\boldsymbol{\mu}, \boldsymbol{\sigma}^2)$ — a latent vector in $\mathbb{R}^{64 \times 64 \times 4}$
  3. Decoder: Reconstructs the image from $\mathbf{z}$

VAE architecture

The VAE learns to preserve perceptually important information while discarding redundant detail. Diffusion in latent space is ~50× cheaper than in pixel space, with negligible quality loss. This single insight enabled Stable Diffusion — the first open-source, consumer-GPU image generator.

How convolutions compress images

A convolution slides a small filter (kernel) across an image, computing a weighted sum at each position. With stride 2, the filter moves 2 pixels at a time, halving the spatial dimensions:

  • 8×8 input → 3×3 conv, stride 2 → 4×4 output (4× fewer values)
  • 4×4 → 3×3 conv, stride 2 → 2×2 (4× fewer again)

Stacking convolution layers creates a hierarchy: pixels → edges → textures → objects.

Convolution downsampling

In practice, kernels are learned, not fixed. Early layers detect edges and gradients; deeper layers compose these into textures and shapes. The averaging kernel above is simplified — real networks learn what to compress. This hierarchy (pixels → edges → textures → objects) is exactly what the VAE encoder learns.

The VAE bottleneck

Property Value
Input resolution 512 × 512 × 3
Latent resolution 64 × 64 × 4
Compression ratio 48× (786K → 16K values)
Reconstruction quality Near-lossless for natural images

The VAE is trained once and frozen. The diffusion model only ever sees latent representations — it never touches pixels during training or sampling.

  • Reconstruction loss: Ensure decoded image matches the original
  • KL divergence: Regularize the latent space so it's smooth and continuous
  • Perceptual loss: Compare features extracted by a pretrained network (e.g., VGG), not raw pixels — so the model prioritizes visual similarity over pixel-exact matching
  • Adversarial loss: A discriminator network tries to distinguish real from reconstructed images, pushing the decoder toward photorealistic outputs

The Kullback-Leibler (KL) divergence measures how one probability distribution $Q$ diverges from a reference distribution $P$:

$$D_{KL}(Q \| P) = \int Q(z) \log \frac{Q(z)}{P(z)} \, dz$$

Intuitively, it quantifies how much information is lost when $Q$ is used to approximate $P$. In VAEs, we want the learned latent distribution to be close to a simple prior (e.g., standard normal) to ensure smoothness and generalization.

CLIP: connecting text and images

CLIP learns a shared embedding space for text and images. It was trained on 400 million image-text pairs from the internet:

  1. Image encoder (Vision Transformer): Maps an image to a vector
  2. Text encoder (Transformer): Maps a caption to a vector
  3. Contrastive training: Matching pairs are pulled close; non-matching pairs are pushed apart

CLIP provides the "language understanding" for text-to-image systems. When you type "a cat wearing a hat," CLIP's text encoder converts it into an embedding that captures the visual meaning — not just the linguistic meaning. This embedding then guides diffusion via cross-attention.

Radford et al. (2021, ICML) "Learning transferable visual models from natural language supervision" — Connection to Lecture 11: CLIP extends the idea of learned embeddings (Word2Vec, GloVe) to a joint text-image space.

Text conditioning with cross-attention

To generate images from text, the denoising backbone adds cross-attention layers (recall attention from Lecture 15). At each layer:

  1. The text prompt is encoded by CLIP into a sequence of token embeddings
  2. The image features at that layer serve as queries (Q)
  3. The text embeddings serve as keys (K) and values (V)
  4. Cross-attention allows each spatial location in the image to attend to relevant words

Cross-attention mechanism

The word "cat" activates high attention weights in the spatial region where the cat is generated. The word "hat" activates weights near the top of the cat region. This spatial-linguistic binding is learned entirely from image-caption pairs.

Classifier-free guidance

Classifier-free guidance (CFG) improves alignment between text and generated images. During training, the text condition is randomly dropped some fraction of the time. At inference:

$$\tilde{\boldsymbol{\epsilon}} = \underbrace{\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t, \varnothing)}_{\text{unconditional}} + w \cdot \Big(\underbrace{\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t, c)}_{\text{conditional}} - \underbrace{\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t, \varnothing)}_{\text{unconditional}}\Big)$$

  • $\boldsymbol{\epsilon}_\theta$ — the denoising network (U-Net or DiT), parameterized by $\theta$
  • $\mathbf{x}_t$ — the noisy latent at timestep $t$
  • $c$ — the text condition (CLIP embedding of the prompt)
  • $\varnothing$ — no text (empty prompt, i.e., unconditional)
  • Unconditional $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t, \varnothing)$ — what the model predicts without any text guidance
  • Conditional $\boldsymbol{\epsilon}_\theta(\mathbf{x}_t, t, c)$ — what it predicts guided by the prompt
  • $w$ — the guidance scale (typically 7–15): how much to amplify the text signal

Think of CFG as asking: "What's different about images that match this prompt versus random images?" Then amplifying that difference. The model learns both what a "dog on a beach" looks like and what a random image looks like — guidance amplifies the gap. See the next slide for examples.

Ho & Salimans (2022) "Classifier-free diffusion guidance" — Used in virtually all modern text-to-image systems.

The guidance scale tradeoff

Classifier-free guidance spectrum

Nearly every text-to-image system uses CFG. Stable Diffusion defaults to $w = 7.5$; DALL-E 2 uses $w \approx 4$. The range of 7–10 balances prompt fidelity against visual quality — pushing higher creates oversaturated, artifact-prone images.

Try it: varying the guidance scale


1from diffusers import StableDiffusionPipeline
2import torch
3
4pipe = StableDiffusionPipeline.from_pretrained(
5    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
6).to("cuda")
7
8for scale in [1.0, 7.5, 20.0]:
9    img = pipe("a cat in a hat", guidance_scale=scale).images[0]
10    img.save(f"guidance_{scale}.png")
  • scale = 1.0: Effectively no guidance — diverse but often off-prompt
  • scale = 7.5: Default sweet spot — faithful to prompt with natural variation
  • scale = 20.0: Over-guided — saturated colors, sharp artifacts, but very literal

U-Net: the original diffusion backbone

The U-Net is the default backbone for diffusion models from DDPM through Stable Diffusion 1–2. Its U-shaped design provides multi-scale reasoning:

  1. Encoder (downsampling): Reduces spatial resolution while increasing channels — captures global context
  2. Bottleneck: Lowest resolution, largest receptive field — reasons about overall composition
  3. Decoder (upsampling): Restores spatial resolution — generates fine details
  4. Skip connections: Concatenate encoder features directly to the matching decoder layer — without these, the decoder must reconstruct spatial detail from the bottleneck alone, losing fine texture and edges

U-Net architecture

Ronneberger et al. (2015, MICCAI) "U-Net: convolutional networks for biomedical image segmentation" — Originally for medical imaging, now the workhorse of diffusion.

Diffusion Transformer (DiT)

The Diffusion Transformer (DiT) replaces the U-Net with a standard Vision Transformer:

  1. Patchify: Divide the noisy latent into non-overlapping patches (e.g., 2×2)
  2. Flatten: Treat patches as a sequence of tokens (just like ViT treats image patches — Lecture 15)
  3. Process: Apply standard Transformer blocks with self-attention
  4. Unpatchify: Reshape back to spatial dimensions to get the predicted noise

DiT pipeline

  • Vision Transformer (ViT): A Transformer that treats an image as a sequence of patches (like tokens in text) — no convolutions needed
  • Noisy latent: The VAE-compressed image with Gaussian noise added at timestep $t$ — this is the input the denoising network must "clean up"
  • Predicted noise: The network's estimate of what noise was added — subtract it from the noisy latent to get a cleaner image

DiT: why replace U-Net?

DiT-XL/2 (675M parameters) achieves state-of-the-art FID of 2.27 on ImageNet, beating all previous diffusion models. More importantly, DiT shows clean scaling behavior — larger models consistently produce better results, with no architectural bottlenecks.

  • Fréchet Inception Distance (FID) (Heusel et al., 2017) measures the distance between real and generated image distributions using features from a pretrained Inception network. Lower FID = more realistic.
  • Inductive biases are assumptions built into the architecture — e.g., convolutions assume local spatial structure; Transformers make fewer such assumptions and let the model learn structure from data.

U-Nets have strong inductive biases for spatial data (locality, hierarchy). These help with small models but become constraints at scale. Transformers make fewer assumptions — the same lesson we saw with language models (Lectures 15–16).

Peebles & Xie (2023, ICCV) "Scalable diffusion models with Transformers" — DiT showed that Transformers can replace U-Nets in diffusion, and the result scales better.

Try it: generate an image


1from diffusers import StableDiffusionPipeline
2import torch
3
4pipe = StableDiffusionPipeline.from_pretrained(
5    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
6).to("cuda")
7
8image = pipe("a photo of an astronaut riding a horse on mars").images[0]
9image.save("astronaut.png")
  1. CLIP encodes your text prompt into embeddings
  2. The U-Net iteratively denoises a random latent, guided by those embeddings via cross-attention
  3. CFG amplifies the text signal at each step (default guidance_scale=7.5)
  4. The VAE decoder converts the final latent back to a 512×512 pixel image

Every concept from this lecture is working together in those 5 lines of code.

Take-home messages

  • The key bottleneck in high-resolution generation wasn't the diffusion process itself — it was where you run it. Compressing to latent space via a VAE made consumer-GPU generation possible.
  • Text-to-image generation requires a bridge between modalities: CLIP creates a shared embedding space (extending the idea of word embeddings from Lecture 11), and cross-attention lets the diffusion model "listen" to text at every spatial location (using the same mechanism as encoder-decoder attention from Lecture 15).
  • Classifier-free guidance shows that controlling generation is as important as generation itself — the trick is surprisingly simple: learn what conditional and unconditional outputs look like, then amplify the difference.
  • The field evolves by composing innovations (VAE + CLIP + CFG + U-Net/DiT), not replacing them. Each addresses one specific limitation.

Questions?

📧 Email
💬 Discord

More diffusion applications (text-to-video, text-to-audio) and ethics of multimodal generative models