* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 22: Scaling up to GPT-3 and beyond

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Describe how GPT-2 demonstrated zero-shot task transfer through scale
  2. Explain how GPT-3 enabled few-shot learning without fine-tuning
  3. Outline the alignment pipeline from RLHF to DPO to GRPO
  4. Explain how reasoning models (o1, DeepSeek-R1) work and why they matter
  5. Critically evaluate the debate over emergent abilities in LLMs

GPT-2: unsupervised multitask learning

GPT-2 (Radford et al., 2019) scaled GPT-1 by an order of magnitude and discovered that larger models can perform tasks without any fine-tuning.

| | GPT-1 (2018) | GPT-2 (2019) |
|---|---|---|
| Parameters | 117M | 1.5B (13×) |
| Training data | BooksCorpus (~5GB) | WebText (40GB, ~8×) |
| Fine-tuning needed? | Yes | No (zero-shot transfer) |

"Language models are unsupervised multitask learners" — a single model trained on next-token prediction implicitly learns to translate, summarize, answer questions, and more. OpenAI staged the release over 9 months due to disinformation concerns — one of the first high-profile responsible AI releases.

Zero-shot task transfer

GPT-2 can perform tasks it was never explicitly trained for, simply by prompting with the right text format. This works because the model encountered similar patterns in its diverse training data.

Translation: "English: I love machine learning\nFrench:" → "J'aime l'apprentissage automatique"

Question answering: "Q: What is the capital of France?\nA:" → "Paris"

Summarization: "[Long article]\n\nTL;DR:" → generates a summary

The model learned these formats from web text where such patterns naturally occur (bilingual pages, Q&A forums, Reddit TL;DR summaries). Performance was far behind fine-tuned SOTA, but the fact that it worked at all was groundbreaking.

GPT-3: few-shot learning at scale

GPT-3 scaled up another 100× (175B parameters, 300B training tokens, an estimated ~$4.6M in training compute) and discovered in-context learning — the ability to learn new tasks from just a few examples in the prompt, with no gradient updates.


Classify each review as Positive or Negative.

Review: "This movie was amazing, I loved every minute!"
Sentiment: Positive

Review: "Terrible film, complete waste of time."
Sentiment: Negative

Review: "The acting was wooden and the plot made no sense."
Sentiment:

GPT-3 output: "Negative" — no weight updates, no fine-tuning, pure pattern recognition.

Beyond Chinchilla: the overtraining era

| Model | Parameters | Training tokens | Tokens/parameter | Strategy |
|---|---|---|---|---|
| GPT-3 | 175B | 300B | 1.7 | Undertrained |
| Chinchilla | 70B | 1.4T | 20 | Compute-optimal |
| LLaMA 3 | 70B | 15T | 214 | Inference-optimal |
| Phi-4 | 14B | 10T+ | 700+ | Extreme overtraining |

The field has moved past Chinchilla-optimal. Since you train once but deploy billions of times, overtraining smaller models minimizes total cost. Phi-4 (14B) matches GPT-3.5 (175B) through massive overtraining on curated data.

From GPT-3 to ChatGPT

| Stage | Model | Key innovation |
|---|---|---|
| 1 | GPT-3 (2020) | Next-token prediction at massive scale |
| 2 | InstructGPT (2022) | Instruction tuning on human-written responses |
| 3 | ChatGPT (Nov 2022) | RLHF (Lecture 16) for helpful, harmless conversation |

ChatGPT reached 100M users in 2 months — the fastest-growing consumer app in history. The key innovation wasn't the model (GPT-3.5) but the alignment: RLHF transformed a next-token predictor into a conversational assistant that felt helpful. This revealed that alignment technique matters as much as model scale.

Beyond RLHF: DPO and GRPO

RLHF requires training a separate reward model and running PPO — complex and unstable. Two alternatives have emerged:

DPO (Rafailov et al., NeurIPS 2023): Direct Preference Optimization skips the reward model entirely. It directly optimizes the LLM to prefer winning responses over losing ones using a simple classification loss. Same quality as RLHF, ~3× less compute.

GRPO (Shao et al., 2024): Group Relative Policy Optimization from DeepSeekMath. Instead of human rankings, it generates multiple responses and uses a verifiable reward (e.g., "is the math answer correct?") to rank them. No humans needed for domains with checkable answers.

Alignment is getting cheaper, simpler, and more automated. This democratizes the ability to create instruction-following models — but also lowers the barrier for misuse.

Chain-of-thought prompting

Chain-of-thought (CoT) prompting asks the model to show its reasoning step by step before giving a final answer. This dramatically improves performance on reasoning tasks.


Q: Roger has 5 tennis balls. He buys 2 more cans of tennis
   balls. Each can has 3 balls. How many does he have now?

Let's think step by step:
1. Roger starts with 5 tennis balls
2. He buys 2 cans × 3 balls per can = 6 new balls
3. Total = 5 + 6 = 11 tennis balls

A: 11

Simply adding "Let's think step by step" improves zero-shot accuracy on the MultiArith arithmetic benchmark from 17.7% to 78.7%, and on GSM8K math problems from 10.4% to 40.7% (Kojima et al., 2022). The model "knows" how to reason — it just needs permission to show its work.

Reasoning models: thinking before answering

Instead of making models bigger (training-time scaling), reasoning models spend more compute at inference time by generating an internal chain-of-thought before answering.

| Model | Organization | Key innovation |
|---|---|---|
| o1 (2024) | OpenAI | Hidden chain-of-thought, trained with RL to "think" |
| o3 (2025) | OpenAI | Extended reasoning, SOTA on ARC-AGI |
| DeepSeek-R1 (2025) | DeepSeek | Open-weight, trained with GRPO, reasoning emerges from RL alone |

R1 was trained with pure RL — no supervised reasoning examples. Yet it spontaneously developed chain-of-thought, self-correction, and even metacognition ("wait, let me reconsider..."). Then R1's reasoning was distilled into smaller models (Llama-70B, Qwen-32B), giving them reasoning abilities at a fraction of the cost.

Are emergent abilities real?

Wei et al. (2022) claimed that abilities like arithmetic and reasoning emerge suddenly at certain scales — absent in small models, present in large ones.

Schaeffer et al. (2023, NeurIPS) challenged this: they showed that "emergence" disappears when you switch from nonlinear metrics (exact-match accuracy) to linear metrics (token-level accuracy). The abilities were developing gradually all along — the metric just couldn't detect partial progress.

If emergence is real, it means we can't predict what larger models will do — they might develop unexpected and potentially dangerous capabilities. If it's a measurement artifact, then scaling is predictable and we can plan for it. The answer has profound implications for AI safety policy.

The modern LLM landscape (2025)

| Year | Milestone |
|---|---|
| 2022 | ChatGPT launched (OpenAI) — 100M users in 2 months |
| 2023 | GPT-4 (multimodal), Llama 2 (open weights), Claude 2 |
| 2024 | Claude 3.5, GPT-4o, Llama 3, Mistral Large, Gemini 1.5 |
| 2024–25 | Reasoning models (o1, o3, R1), small efficient models (Phi-4, Gemma 2) |
  1. Reasoning at inference time — spending more compute when thinking, not just when training
  2. Longer context windows — 128K–1M+ tokens (Gemini 1.5 Pro)
  3. Open weight models — LLaMA, Mistral, DeepSeek approaching frontier quality
  4. Multimodal — text + vision + audio in a single model
  5. Efficiency — smaller models matching larger predecessors (Phi-4 14B ≈ GPT-3.5 175B)

Current limitations of LLMs

  • Hallucinations: Confidently generate false statements with no awareness of uncertainty
  • Reasoning depth: Multi-step logic and mathematical proofs remain unreliable (even reasoning models have limits)
  • Knowledge currency: Training data has a cutoff date; no access to real-time information
  • Consistency: May give different answers to the same question across runs
  • Prompt sensitivity: Small wording changes can dramatically alter outputs

Each limitation has spawned research directions: RAG for knowledge currency (Lecture 17), tool use and agents for grounding (Lecture 24), constitutional AI for alignment, reasoning models for reliability, and formal verification for correctness guarantees.

Discussion

  1. Emergent abilities: If Schaeffer is right that "emergence" is a measurement artifact, does that make scaling more or less concerning? What if Wei is right?

  2. Reasoning models: DeepSeek-R1 developed chain-of-thought reasoning through pure RL — no human examples. Does this constitute "learning to think"? How is this different from how children learn to reason?

  3. The alignment tax: DPO and GRPO make alignment cheaper and more accessible. Is this good (more aligned models) or dangerous (easier to create models aligned to harmful objectives)?

  4. Open vs closed: DeepSeek-R1's weights are openly released. OpenAI's o1 is closed. If open-weight models reach frontier quality, can safety-through-secrecy still work? Should it?

Further reading

Brown et al. (2020, NeurIPS) "Language Models are Few-Shot Learners" — GPT-3: in-context learning at scale.

Rafailov et al. (2023, NeurIPS) "Direct Preference Optimization" — RLHF without the RL.

DeepSeek-AI (2025, arXiv) "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning" — Open-weight reasoning model.

Schaeffer et al. (2023, NeurIPS) "Are Emergent Abilities of Large Language Models a Mirage?" — The metric artifact hypothesis.

Kojima et al. (2022, NeurIPS) "Large Language Models are Zero-Shot Reasoners" — "Let's think step by step."

OpenAI (2024, arXiv) "o1 System Card" — Reasoning via test-time compute scaling.

Questions?

📧 Email
💬 Discord

Implementing GPT from scratch: building a language model in PyTorch