* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 23: Diffusion models for video and audio; ethics of generative AI

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Explain how Sora extends diffusion to video via spacetime patches
  2. Describe how text-to-audio systems generate sound using spectrogram-based diffusion
  3. Compare open-source text-to-video models and their architectural choices
  4. Evaluate ethical implications of multimodal generative AI: deepfakes, consent, bias, and regulation

Roadmap and companion notebook

There are no classes February 23–27 (I'll be away!). Use this time to work on your final project and the optional Assignment 5 (Build GPT).

📓 Companion Notebook — generate video from text, create audio with AudioLDM 2, and visualize spectrograms.

The Sora pipeline

Sora (OpenAI, 2024) extends the DiT architecture (Lecture 22) from 2D image patches to 3D spacetime patches:

  1. Patchify: Divide the 3D latent volume into spacetime cubes → a token sequence
  2. Denoise: A DiT transformer processes the tokens, conditioned on text via cross-attention
  3. Unpatchify: Reassemble tokens back into a 3D latent volume
  4. Decode: The video VAE decoder reconstructs pixel-space video frames

Sora full pipeline

OpenAI (2024) "Video generation models as world simulators" — Sora treats video generation as spacetime denoising.

The video VAE

Like the image VAE from Lecture 22, a video VAE compresses input into a lower-dimensional latent space. The key difference: it uses 3D convolutions that compress along the temporal dimension as well as spatial dimensions.

  • Encoder: Raw video (T × H × W × 3) → 3D latent (t × h × w × C) with t ≪ T, h ≪ H, w ≪ W
  • Decoder: 3D latent → reconstructed video at original resolution
  • This compression makes diffusion tractable — denoising operates in the small latent space, not on raw pixels

Video VAE architecture

3D spacetime patchify

After the video VAE compresses video into a 3D latent, patchify converts it into a sequence of tokens — just like ViT (Lecture 22) does for images:

  1. Divide the 3D latent volume into small cubes (e.g., 2 × 2 × 2 in height × width × time)
  2. Flatten each cube into a vector
  3. Project each vector into the transformer's embedding dimension via a linear layer

Spacetime patchify

Because patches are resolution-agnostic, Sora can handle any resolution, aspect ratio, or duration — the token sequence simply gets longer or shorter. This is a major advantage over fixed-resolution architectures.

Sora: emergent capabilities

Sora exhibits surprising behaviors not explicitly trained:

  • 3D consistency: Objects maintain shape as the camera moves
  • Long-range coherence: Characters persist across scene changes
  • Physics simulation: Water flows, reflections update, objects interact plausibly
  • Variable format: Handles different resolutions, aspect ratios, and durations — because spacetime patches are resolution-agnostic

OpenAI describes Sora as a "world simulator." Does a model that generates plausible physics understand physics, or is it pattern-matching at a scale we find convincing? This connects to our discussion of language and understanding (Lecture 20) — predicting the next token (or the next frame) may be more powerful than it appears.

The text-to-video landscape

All major text-to-video models share Sora's core recipe — video VAE + DiT + text conditioning — but vary in scale, training data, and architectural details.

Text-to-video model comparison

Hong et al. (2024) "CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer" — 3D causal VAE + expert adaptive LayerNorm.

Wan Video Team (2025) "Wan 2.1" — Top VBench scores; 1.3B model runs on consumer GPUs (~8 GB VRAM).

From waveforms to spectrograms

Audio generation applies diffusion to spectrograms — visual representations of sound frequencies over time:

  1. Convert: Transform audio into a mel spectrogram (frequency bands × time steps, weighted to match human hearing)
  2. Compress: A VAE encodes the spectrogram into a latent space (just like image latent diffusion)
  3. Denoise: Latent diffusion generates a clean spectrogram conditioned on text
  4. Reconstruct: A vocoder (e.g., HiFi-GAN) converts the spectrogram back to an audio waveform

Audio diffusion pipeline

By converting audio to a spectrogram, we turn a 1D temporal signal into a 2D image — and the entire image diffusion toolkit (VAE, latent diffusion, text conditioning) transfers directly.

Mel spectrograms: turning sound into images

A mel spectrogram is a 2D image representing sound. It is computed by:

  1. Short-time Fourier transform (STFT): Slice the audio into overlapping windows and compute the frequency content of each
  2. Mel filter bank: Re-weight frequency bins onto the mel scale, which spaces low frequencies widely and high frequencies narrowly — matching how humans perceive pitch
  3. Log scaling: Convert power to decibels for better dynamic range

The result: a 2D image (frequency × time) that diffusion models can process using the same architectures designed for photographs.

Mel spectrogram

Text conditioning in AudioLDM 2

AudioLDM 2 uses dual cross-attention to condition the UNet on text — similar to text-to-image diffusion (Lecture 22), but with two embedding sources:

  1. CLAP + Flan-T5 encode the text prompt into embeddings
  2. GPT-2 auto-regressively generates audio-space embeddings from the text embeddings
  3. The UNet receives two cross-attention inputs: Flan-T5 embeddings AND GPT-2 output embeddings

Cross-attention works the same way here as in text-to-image: the queries come from the audio latent features, and the keys/values come from the text embeddings. This lets the denoising network "look up" which parts of the text are relevant at each spatial/frequency location.

CLAP captures audio-text alignment (like CLIP for images), while Flan-T5 provides rich semantic understanding. The GPT-2 bridge translates between text and audio embedding spaces — producing the "language of audio" (LOA).

Text-to-audio systems

System Modality Approach Key feature
AudioLDM 2 Music + speech + effects Latent diffusion CLAP text encoder
Stable Audio (Stability AI) Music + effects Latent diffusion + timing Controllable duration
MusicGen (Meta) Music Autoregressive (not diffusion) Single-stage, no vocoder
Bark (Suno) Speech Autoregressive + diffusion Multilingual, emotion control

Diffusion and autoregressive approaches are converging — many systems use hybrid architectures.

  • CLAP (Contrastive Language-Audio Pretraining): The audio equivalent of CLIP — learns a shared embedding space for text and audio, enabling text-conditioned generation
  • Vocoder: A neural network (e.g., HiFi-GAN) that reconstructs audio waveforms from spectrograms

The vocoder: from spectrograms to sound

Converting a mel spectrogram back to audio is not a simple inverse transform. Spectrograms discard phase information — the timing relationships between frequency components. Recovering natural-sounding audio requires a learned neural network.

Vocoder pipeline

HiFi-GAN (Kong et al., 2020) is a GAN-based vocoder: a generator uses transposed convolutions to progressively upsample the spectrogram to waveform resolution, while multi-scale discriminators judge audio quality at different time scales. It produces 22 kHz audio at 168× real-time speed.

Ethics: deepfakes and consent

Diffusion models can generate photorealistic images of people who never consented to being depicted. This has led to:

  • Non-consensual intimate imagery: Deepfake tools targeting individuals (predominantly women), with devastating personal consequences
  • Political disinformation: Fabricated images of public figures in compromising situations, weaponized during elections
  • Identity fraud: Generated faces used for fake accounts, scam operations, and social engineering

A 2019 Sensity AI (Deeptrace) report found that 96% of deepfake videos online are non-consensual intimate imagery, and the number of deepfake videos doubled in just 9 months. By 2023, Sumsub reported a 10× increase in detected deepfakes year-over-year. The democratization of generation tools has far outpaced legal and technical protections.

Ethics: bias in generated content

Diffusion models trained on internet-scale datasets inherit (and sometimes amplify) societal biases:

  • Gender stereotypes: "CEO" generates predominantly white male faces; "nurse" generates predominantly female faces
  • Racial bias: Prompts for "beautiful person" over-represent light-skinned individuals
  • Cultural erasure: Non-Western artistic styles, architectural traditions, and cultural contexts are underrepresented
  • Homogenization: Generated images converge toward a narrow aesthetic — the "AI look" — reducing visual diversity

Bias exists at every level: in the training data (internet images skew Western/male/young), in the text encoder (CLIP's training data has similar biases), and in the evaluation metrics (FID scores reward realism of common scenes over diverse representation).

Ethics: copyright and training data

Case Status Key issue
Getty Images v. Stability AI UK: copyright claims rejected (Nov 2025); US: ongoing Training on copyrighted stock photos
Andersen v. Stability AI Class action (2023–); trial Sept 2026 Artists' styles replicated without consent
NYT v. OpenAI Filed Dec 2023; copyright claims survive (Mar 2025) Verbatim reproduction of articles
Thomson Reuters v. Ross Ruled Feb 2025 — fair use defense failed Training on proprietary legal database

Training data is scraped from the internet without explicit consent. Artists argue this constitutes copyright infringement — their styles are being replicated without compensation. Companies argue this is "fair use" and transformative. Courts are still deciding, but the outcome will shape the future of all generative AI, not just diffusion models.

Ethics: regulation and provenance

  • EU AI Act (2024): Requires labeling of AI-generated content, transparency about training data, risk classification for generative systems
  • C2PA (Coalition for Content Provenance and Authenticity): Technical standard for embedding provenance metadata in images and videos — "nutrition labels" for digital content
  • US Executive Order 14110 (Oct 2023): Requires watermarking of AI-generated content from government contractors
  • China's deep synthesis regulations (2023): Mandatory labeling and registration of deepfake services

Regulation works when enforced. But technical solutions (watermarking, detection models) face an arms race: as detectors improve, generators adapt. The most promising approach may be provenance — embedding an unforgeable record of how content was created, rather than trying to detect fakes after the fact.

Discussion

  1. The world simulator question: Sora generates videos with plausible physics. Does this mean it has learned a model of the physical world, or is it pattern-matching at a scale we find convincing? How would we tell the difference?

  2. The spectrogram trick: Converting audio to spectrograms lets us reuse image diffusion tools for sound. What other modalities could be "converted to images" and generated this way? What are the limits of this approach?

  3. Regulation tradeoffs: Strict regulation of generative AI could slow harmful applications but also impede beneficial research. How should society balance these? Is open-source part of the problem or part of the solution?

  4. The compression connection: Language compresses thought (Lecture 20). VAEs compress images. Spectrograms compress audio. Diffusion generates by decompressing noise. Is there a deep connection between communication, compression, and generation?

Take-home messages

  • The same diffusion framework scales across modalities — images (Lecture 22), video (Sora), and audio (AudioLDM) — suggesting iterative refinement from noise is a general-purpose generation principle.
  • The spectrogram trick illustrates a powerful pattern: convert your data into a format where existing tools work, then convert back. Turning audio into "images" unlocks the entire latent diffusion pipeline.
  • Sora's emergent physics simulation raises the question: is predicting the next frame enough to learn a world model? The same question we asked about language (Lecture 20) now applies to vision.
  • The ethics of generative AI are not optional add-ons — consent, bias, copyright, and provenance are central design challenges, not afterthoughts.

Further reading

OpenAI (2024) "Video Generation Models as World Simulators" — Sora: spacetime patches and emergent physics.

Liu et al. (2023, IEEE/ACM TASLP) "AudioLDM 2: Learning Holistic Audio Generation with Self-supervised Pretraining" — Latent diffusion for audio with CLAP conditioning.

Hong et al. (2024) "CogVideoX: Text-to-Video Diffusion Models with An Expert Transformer" — Open-source text-to-video with 3D causal VAE.

Yang et al. (2024, ACM Computing Surveys) "Diffusion Models: A Comprehensive Survey of Methods and Applications" — Broad overview of diffusion across modalities.

Questions?

📧 Email
💬 Discord

Week 9 (after break): Agents and tool use — giving language models the ability to act in the world, mixture-of-expert models