/**
 * CDL Chart.js theme defaults.
 *
 * Usage:
 * 1. Load Chart.js (e.g. from a CDN) before this script.
 * 2. Include this script.
 * 3. Create charts with minimal configuration - colors are auto-applied!
 */
(function () {
  'use strict';

  // ==========================================================================
  // READ COLORS FROM CSS CUSTOM PROPERTIES
  // This ensures chart colors stay in sync with the theme
  // ==========================================================================

  /**
   * Get a CSS custom property value from :root.
   *
   * @param {string} name - Property name, e.g. '--chart-color-1'.
   * @param {string} [fallback=''] - Returned when the property is unset/empty
   *   or when no styling environment exists (non-browser context).
   * @returns {string} The trimmed property value, or the fallback.
   */
  function getCSSVar(name, fallback = '') {
    // Guard both globals so the script is inert outside a browser.
    if (typeof getComputedStyle === 'undefined' || typeof document === 'undefined') {
      return fallback;
    }
    const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim();
    return value || fallback;
  }

  /**
   * Build the full palette from CSS custom properties, with hard-coded
   * fallbacks so charts still render when the theme CSS is missing.
   * @returns {object} Palette: brand colors, chart color arrays, semantic
   *   colors, grid/axis colors, and the chart font family.
   */
  function buildPaletteFromCSS() {
    return {
      // Primary brand colors
      dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'),
      textPrimary: getCSSVar('--text-primary', '#0a2518'),
      textSecondary: getCSSVar('--text-secondary', '#0a3d23'),

      // Chart colors (from CSS --chart-color-N variables)
      chartColors: [
        getCSSVar('--chart-color-1', '#00693e'),
        getCSSVar('--chart-color-2', '#267aba'),
        getCSSVar('--chart-color-3', '#ffa00f'),
        getCSSVar('--chart-color-4', '#9d162e'),
        getCSSVar('--chart-color-5', '#8a6996'),
        getCSSVar('--chart-color-6', '#a5d75f'),
        getCSSVar('--chart-color-7', '#003c73'),
        getCSSVar('--chart-color-8', '#d94415'),
        getCSSVar('--chart-color-9', '#643c20'),
        getCSSVar('--chart-color-10', '#c4dd88'),
        getCSSVar('--chart-color-11', '#f5dc69'),
        getCSSVar('--chart-color-12', '#424141'),
      ],

      // Background colors (semi-transparent versions)
      chartBgColors: [
        getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'),
        getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'),
        getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'),
        getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'),
        getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'),
        getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'),
      ],

      // Semantic colors
      positive: getCSSVar('--chart-positive', '#00693e'),
      negative: getCSSVar('--chart-negative', '#9d162e'),
      neutral: getCSSVar('--chart-neutral', '#424141'),
      highlight: getCSSVar('--chart-highlight', '#ffa00f'),

      // Grid and axis colors
      gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'),
      gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'),
      gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'),
      axisColor: getCSSVar('--chart-axis-color', '#0a2518'),

      // Font
      fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"),
    };
  }

  // Initialize palette (will be populated when DOM is ready)
  let CDL_PALETTE = null;
  // For convenience, expose primary chart colors array
  let CHART_COLORS = null;

  // ==========================================================================
  // FONT CONFIGURATION
  // Responsive font sizes based on typical Marp slide dimensions (1280x720)
  // ==========================================================================

  const FONT_CONFIG = {
    sizes: {
      title: 22,      // Chart title
      subtitle: 18,   // Subtitle
      legend: 16,     // Legend labels
      axisTitle: 18,  // Axis titles
      axisTicks: 16,  // Axis tick labels
      tooltip: 14,    // Tooltip text
      dataLabels: 14, // Data labels on charts
    },
    weight: {
      normal: 400,
      medium: 500,
      bold: 600,
    },
  };

  // ==========================================================================
  // HELPER FUNCTIONS
  // ==========================================================================

  /**
   * Ensure the palette has been read from CSS (lazy, one-time).
   * @returns {object} The initialized palette.
   */
  function ensurePalette() {
    if (!CDL_PALETTE) {
      CDL_PALETTE = buildPaletteFromCSS();
      CHART_COLORS = CDL_PALETTE.chartColors;
    }
    return CDL_PALETTE;
  }

  /**
   * Get the color for a dataset at a given index.
   * Cycles through the palette if there are more datasets than colors.
   * @param {number} index - Dataset index (non-negative integer).
   * @returns {string} A CSS color string.
   */
  function getColor(index) {
    ensurePalette();
    return CHART_COLORS[index % CHART_COLORS.length];
  }

  /**
   * Return `color` with the given alpha transparency applied.
   * Supports #rrggbb and shorthand #rgb hex, rgba(...), and rgb(...);
   * any other format is returned unchanged.
   * @param {string} color - Input color.
   * @param {number} alpha - Alpha in [0, 1].
   * @returns {string} An rgba(...) string (or the input, if unrecognized).
   */
  function getColorWithAlpha(color, alpha) {
    // Handle hex colors
    if (color.startsWith('#')) {
      let hex = color.slice(1);
      // Expand shorthand #rgb to #rrggbb so the channel slices are valid.
      if (hex.length === 3) {
        hex = hex.split('').map((c) => c + c).join('');
      }
      const r = parseInt(hex.slice(0, 2), 16);
      const g = parseInt(hex.slice(2, 4), 16);
      const b = parseInt(hex.slice(4, 6), 16);
      return `rgba(${r}, ${g}, ${b}, ${alpha})`;
    }
    // Handle rgba colors: swap out the trailing alpha component.
    if (color.startsWith('rgba')) {
      return color.replace(/[\d.]+\)$/, `${alpha})`);
    }
    // Handle rgb colors: convert to rgba. (Previously the rgba branch's
    // regex would have clobbered the blue channel for rgb() inputs.)
    if (color.startsWith('rgb(')) {
      return color.replace(/^rgb\(/, 'rgba(').replace(/\)$/, `, ${alpha})`);
    }
    return color;
  }

  /**
   * Auto-assign palette colors (and sensible per-type visual defaults) to
   * every dataset in `data` that doesn't already specify its own.
   * Mutates and returns `data`.
   * @param {object} data - Chart.js data object ({ datasets: [...] }).
   * @param {string} chartType - Chart.js chart type ('bar', 'line', ...).
   * @returns {object} The same `data` object.
   */
  function autoAssignColors(data, chartType) {
    if (!data || !data.datasets) return data;

    data.datasets.forEach((dataset, index) => {
      const baseColor = getColor(index);

      // Only assign colors/defaults if not already specified by the caller.
      switch (chartType) {
        case 'bar':
        case 'horizontalBar':
          if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; }
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; }
          break;

        case 'line':
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1);
          }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; }
          if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; }
          if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; }
          if (dataset.tension === undefined) { dataset.tension = 0.3; }
          break;

        case 'scatter':
        case 'bubble':
          if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; }
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; }
          if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; }
          break;

        case 'pie':
        case 'doughnut':
        case 'polarArea':
          // For pie charts, we need multiple colors for one dataset
          if (!dataset.backgroundColor) {
            const numItems = dataset.data ? dataset.data.length : 6;
            dataset.backgroundColor = [];
            for (let i = 0; i < numItems; i++) {
              dataset.backgroundColor.push(getColor(i));
            }
          }
          if (!dataset.borderColor) {
            dataset.borderColor = '#d8d8d8'; // Slide background
          }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; }
          break;

        case 'radar':
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2);
          }
          if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; }
          if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; }
          if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; }
          break;

        default:
          // Generic color assignment
          if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; }
          if (!dataset.borderColor) { dataset.borderColor = baseColor; }
      }
    });

    return data;
  }

  // ==========================================================================
  // CHART.JS GLOBAL DEFAULTS
  // ==========================================================================

  /**
   * Push CDL fonts/colors into Chart.defaults (global, plugin, and per-scale).
   * @returns {boolean} true if defaults were applied; false if Chart.js is
   *   not loaded.
   */
  function applyGlobalDefaults() {
    if (typeof Chart === 'undefined') {
      console.warn('Chart.js not loaded. chart-defaults.js requires Chart.js to be loaded first.');
      return false;
    }

    // Ensure palette is loaded from CSS
    const palette = ensurePalette();

    // Font defaults
    Chart.defaults.font.family = palette.fontFamily;
    Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks;
    Chart.defaults.color = palette.textPrimary;

    // Responsive defaults
    Chart.defaults.responsive = true;
    Chart.defaults.maintainAspectRatio = false;

    // Animation (subtle)
    Chart.defaults.animation.duration = 400;

    // Plugin defaults
    // Legend
    Chart.defaults.plugins.legend.labels.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.legend,
      weight: FONT_CONFIG.weight.normal,
    };
    Chart.defaults.plugins.legend.labels.color = palette.textPrimary;
    Chart.defaults.plugins.legend.labels.usePointStyle = true;
    Chart.defaults.plugins.legend.labels.padding = 20;

    // Title
    Chart.defaults.plugins.title.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.title,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.title.color = palette.textPrimary;

    // Tooltip
    Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary;
    Chart.defaults.plugins.tooltip.titleFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.tooltip.bodyFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
    };
    Chart.defaults.plugins.tooltip.cornerRadius = 4;
    Chart.defaults.plugins.tooltip.padding = 10;

    // Scale defaults (for cartesian charts)
    // These need to be applied per-scale type
    const scaleDefaults = {
      grid: { color: palette.gridLight, lineWidth: 1 },
      border: { color: palette.gridDark, width: 1 },
      ticks: {
        font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks },
        color: palette.textPrimary,
      },
      title: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTitle,
          weight: FONT_CONFIG.weight.normal,
        },
        color: palette.textPrimary,
      },
    };

    // Merge scaleDefaults into one cartesian scale's default config.
    // (Replaces three identical copy-pasted stanzas in the original.)
    const applyToScale = (scale) => {
      if (!scale) return;
      if (scale.grid) Object.assign(scale.grid, scaleDefaults.grid);
      if (scale.border) Object.assign(scale.border, scaleDefaults.border);
      if (scale.ticks) Object.assign(scale.ticks, scaleDefaults.ticks);
      if (scale.title) Object.assign(scale.title, scaleDefaults.title);
    };

    if (Chart.defaults.scales) {
      applyToScale(Chart.defaults.scales.linear);
      applyToScale(Chart.defaults.scales.category);
      applyToScale(Chart.defaults.scales.logarithmic);
    }

    // Apply scale defaults to radial scale (for radar charts)
    if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) {
      const radial = Chart.defaults.scales.radialLinear;
      if (radial.grid) radial.grid.color = palette.gridLight;
      if (radial.angleLines) radial.angleLines.color = palette.gridMedium;
      if (radial.pointLabels) {
        radial.pointLabels.font = {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        };
        radial.pointLabels.color = palette.textPrimary;
      }
    }

    return true;
  }

  // ==========================================================================
  // CHART WRAPPER FOR AUTO-STYLING
  // ==========================================================================

  /**
   * Replace window.Chart with a wrapper that auto-applies CDL colors and
   * per-type option defaults before delegating to the real constructor.
   * Static members and `instanceof Chart` keep working via the prototype
   * chain set up below.
   */
  function wrapChartConstructor() {
    if (typeof Chart === 'undefined') return;

    const OriginalChart = Chart;

    // Create a wrapper that auto-applies colors
    window.Chart = function (ctx, config) {
      // Auto-assign colors if not specified
      if (config && config.data) {
        config.data = autoAssignColors(config.data, config.type);
      }
      // Merge default options for specific chart types
      if (config && config.options) {
        config.options = applyChartTypeDefaults(config.type, config.options);
      }
      // Call original constructor
      return new OriginalChart(ctx, config);
    };

    // Statics (Chart.register, Chart.defaults, ...) resolve through the
    // prototype chain; Object.assign additionally copies own enumerable ones.
    Object.setPrototypeOf(window.Chart, OriginalChart);
    Object.assign(window.Chart, OriginalChart);

    // Preserve the prototype chain so `instanceof Chart` still holds.
    window.Chart.prototype = OriginalChart.prototype;
  }

  /**
   * Apply chart-type specific option defaults. Nested option objects are
   * shallow-cloned before being edited so the caller's config object is
   * never mutated in place (the original mutated userOptions.scales etc.).
   * @param {string} chartType - Chart.js chart type.
   * @param {object} userOptions - Options supplied by the caller.
   * @returns {object} A new options object with defaults filled in.
   */
  function applyChartTypeDefaults(chartType, userOptions) {
    const options = { ...userOptions };

    switch (chartType) {
      case 'bar':
      case 'horizontalBar':
        // Bar chart defaults
        options.scales = { ...options.scales };
        options.scales.x = { ...options.scales.x };
        options.scales.y = { ...options.scales.y };
        // Hide x-axis grid for cleaner look
        if (options.scales.x.grid === undefined) {
          options.scales.x.grid = { display: false };
        }
        break;

      case 'line':
        // Line chart defaults
        if (!options.interaction) {
          options.interaction = { intersect: false, mode: 'index' };
        }
        break;

      case 'pie':
      case 'doughnut':
        // Pie/doughnut defaults
        options.plugins = { ...options.plugins };
        if (options.plugins.legend === undefined) {
          const palette = ensurePalette();
          options.plugins.legend = {
            position: 'right',
            labels: {
              font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend },
              color: palette.textPrimary,
              padding: 15,
            },
          };
        }
        break;

      case 'radar':
        // Radar chart defaults - keep as-is, scale defaults applied globally
        break;

      case 'scatter':
      case 'bubble':
        // Scatter/bubble defaults
        options.scales = { ...options.scales };
        options.scales.x = { ...options.scales.x };
        options.scales.y = { ...options.scales.y };
        break;
    }

    return options;
  }

  // ==========================================================================
  // CONVENIENCE FUNCTIONS FOR USERS
  // Exposed on window.CDLChart for easy access
  // ==========================================================================

  window.CDLChart = {
    // Color palette access (getters to ensure lazy initialization)
    get colors() { return ensurePalette().chartColors; },
    get palette() { return ensurePalette(); },

    // Get specific color by index
    getColor: getColor,

    // Get color with transparency
    getColorWithAlpha: getColorWithAlpha,

    /**
     * Get an array of the first `count` palette colors (cycling if needed).
     * @param {number} count - Number of colors wanted.
     * @returns {string[]}
     */
    getColors: function (count) {
      ensurePalette();
      const result = [];
      for (let i = 0; i < count; i++) {
        result.push(getColor(i));
      }
      return result;
    },

    // Font configuration
    fonts: FONT_CONFIG,

    // Quick chart creation helpers
    // These create minimal config that auto-applies all styling

    /**
     * Create a simple bar chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    bar: function (canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'bar',
        data: { labels: labels, datasets: [{ data: data }] },
        options: {
          plugins: { legend: { display: false } },
          ...options,
        },
      });
    },

    /**
     * Create a simple line chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    line: function (canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'line',
        data: {
          labels: labels,
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
            fill: ds.fill !== undefined ? ds.fill : true,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a simple pie chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    pie: function (canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'pie',
        data: { labels: labels, datasets: [{ data: data }] },
        options: options,
      });
    },

    /**
     * Create a simple scatter chart
     * @param {string} canvasId - Canvas element ID
     * @param {Array} datasets - Array of {label, data: [{x, y}]} objects
     * @param {object} options - Optional overrides
     */
    scatter: function (canvasId, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'scatter',
        data: {
          datasets: datasets.map(ds => ({ label: ds.label, data: ds.data })),
        },
        options: options,
      });
    },

    /**
     * Create a doughnut chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    doughnut: function (canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'doughnut',
        data: { labels: labels, datasets: [{ data: data }] },
        options: options,
      });
    },

    /**
     * Create a radar chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    radar: function (canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'radar',
        data: {
          labels: labels,
          datasets: datasets.map(ds => ({ label: ds.label, data: ds.data })),
        },
        options: options,
      });
    },
  };

  // ==========================================================================
  // INITIALIZATION
  // ==========================================================================

  /**
   * Apply defaults now if Chart.js is present; otherwise poll for it
   * (every 100 ms, up to 5 s) and apply once it appears.
   * @returns {boolean} true if defaults were applied synchronously.
   */
  function initialize() {
    // Wait for Chart.js to be available
    if (typeof Chart !== 'undefined') {
      applyGlobalDefaults();
      wrapChartConstructor();
      console.log('CDL Chart defaults applied successfully.');
      return true;
    }

    // Chart.js not yet loaded - wait and retry
    let retries = 0;
    const maxRetries = 50; // 5 seconds max wait
    const checkInterval = setInterval(function () {
      retries++;
      if (typeof Chart !== 'undefined') {
        clearInterval(checkInterval);
        applyGlobalDefaults();
        wrapChartConstructor();
        console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).');
      } else if (retries >= maxRetries) {
        clearInterval(checkInterval);
        console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.');
      }
    }, 100);
    return false;
  }

  // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts
  // Chart.js CDN should be loaded before this script
  initialize();
})();

Lecture 24: The thinking revolution

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Explain chain-of-thought prompting — why intermediate reasoning steps improve LLM performance
  2. Describe how reasoning models are trained via reinforcement learning on verifiable rewards
  3. Explain the mechanics of test-time compute scaling — what happens inside a reasoning model at inference
  4. Analyze why thinking works: the relationship between token generation and computation
  5. Evaluate the evidence: is longer "thinking" genuine reasoning or sophisticated pattern completion?
  6. Explain Mixture of Experts — how sparse activation lets models store more knowledge while keeping inference costs manageable

Welcome back!

This is our final week of new material. Three (content) lectures remain:

  • Today: The thinking revolution — how and why reasoning models work
  • Wednesday: Agents, tools, and the agentic era
  • Friday: The reckoning — society, safety, and what comes next

Final Project presentations are on March 9 (next Monday, our final day of class). Deliverables:

  • A notebook containing your project's code, results, and analysis (should run in Google Colab)
  • A presentation (presented in class on March 9) summarizing what you did, what you found, and why it matters. You should also upload your slides to Canvas.
  • A final report (roughly 2–5 pages) describing your project in detail

I can be available during our X-hour (Thursday) to help with final projects, if there is interest in this. I'll treat it as an open Q&A session; I'll plan to stay, answer questions or help on a first-come-first-served basis, until there are no more questions, and then we'll wrap up.

A new paradigm: test-time compute

Every model we've studied so far (rules-based models, embeddings, autoregressive models, and diffusion models) improved primarily by making training bigger: more data, more parameters, more compute during training. In late 2024, a new paradigm emerged: test-time compute scaling — spending more compute at inference to improve results.

A smaller model that "thinks longer" can (sometimes) outperform a larger model that answers immediately. This decouples model quality from model size in a way that changes how we build and deploy AI.

Check out some interactive examples in the companion notebook.

Two scaling axes

Training compute Inference compute
When Before deployment (once) At query time (every call)
What improves Base knowledge, capabilities Reasoning depth on hard problems
Scaling law Kaplan et al. (2020) Snell et al. (2024)

Two scaling axes

  • Traditional scaling: improving models by increasing training data, parameters, and compute (Kaplan et al., 2020)
  • Test-time scaling: improving outputs by spending more compute at inference (Snell et al., 2024)
  • Scaling law: a power-law relationship between compute invested and model performance
  • "Both axes" (purple in chart): models like o3 that invest heavily in both training and inference compute

Chain-of-thought prompting

Chain-of-thought (CoT) prompting showed that LLMs perform dramatically better on reasoning tasks when prompted to generate intermediate steps before answering. This requires no model changes — just different prompting.

Chain-of-thought

Each generated token is a computation step. When a model writes "2 cans × 3 = 6," it's predicting "6" as the most likely next token based on patterns learned during training — and that prediction then gets stored in context, where subsequent tokens can build on it. More intermediate tokens = more serial computation = harder problems become solvable.

GSM8K (Cobbe et al., 2021) — Grade School Math 8K, a dataset of 8,500 grade-school math word problems requiring 2–8 reasoning steps using basic arithmetic. A standard benchmark for multi-step mathematical reasoning in LLMs.

Why do intermediate steps help?

  • Threshold circuit: a Boolean circuit whose gates are threshold functions — a gate outputs 1 if at least $k$ of its $n$ inputs are 1. These circuits can compute addition, multiplication, and sorting.
  • Constant-depth ($\mathsf{TC}^0$): a threshold circuit with a fixed number of layers, no matter the input size. It can do a lot in parallel but has limited sequential depth.
  • Why this matters: a transformer with $L$ layers is essentially a constant-depth threshold circuit — it always performs exactly $L$ sequential steps per token, regardless of problem difficulty.

A transformer with $L$ layers performs $L$ sequential computation steps per token. For a 100-layer model answering in one token, you get 100 steps of computation — regardless of problem difficulty.

But if the model generates $N$ intermediate tokens first, it gets $L \times N$ steps — the entire model runs once per token, and each token can attend to all previous tokens.

Merrill & Sabharwal (2024) proved that constant-depth transformers are limited to problems in the complexity class $\mathsf{TC}^0$ (constant-depth threshold circuits). But with a chain-of-thought of length $T$, a transformer can simulate $T$ steps of any Turing machine — making it Turing-complete.

In plain language: without CoT, transformers literally cannot solve certain problems no matter how large. With CoT, they can solve anything (given enough tokens).

From prompting to training: the evolution

Year Technique Key idea Reference
2022 Chain-of-thought prompting Prompt the model with reasoning examples Wei et al.
2023 Tree-of-thought, self-consistency Generate multiple paths, pick the best Yao et al.
2024 Reasoning models (o1) Train the model via RL to reason on its own OpenAI
2025 Adaptive thinking (Claude 4.x) Model decides when and how much to think Anthropic

CoT prompting (Wei et al., 2022) showed that intermediate reasoning helps. Reasoning models take this further: instead of prompting the model to reason, you train it via reinforcement learning on verifiable rewards (correct math, passing code tests). The model learns to generate its own internal reasoning traces — and these traces can be far more effective than human-written examples.

How reasoning models are trained

The key innovation: instead of supervised learning (human writes the reasoning), use reinforcement learning where the model discovers its own reasoning strategies:

  1. Sample: Model generates multiple complete solutions (reasoning + answer) for each problem
  2. Verify: Check which answers are correct (math: exact match; code: passes unit tests)
  3. Update: Reinforce reasoning patterns that led to correct answers; suppress those that didn't
  4. Repeat: Over millions of problems, the model learns which reasoning strategies work

This only works for verifiable tasks — problems where we can automatically check correctness. Math, coding, and formal logic have clear right/wrong answers. Open-ended writing, ethics, and creative tasks do not. This is a fundamental limitation of the RL approach to reasoning.

GRPO: how DeepSeek-R1 learns to reason

DeepSeek-R1 uses GRPO (Group Relative Policy Optimization) — a simpler alternative to PPO (Proximal Policy Optimization, the standard RL algorithm used for RLHF, which requires a separate "critic" model to estimate value). GRPO eliminates the critic by comparing solutions within each group:

  1. For each problem, generate a group of $G$ candidate solutions (e.g., $G = 64$)
  2. Score each: correct answer → reward $+1$, wrong → reward $0$
  3. Compute relative advantage: how much better/worse than the group average?
  4. Update the model to increase probability of above-average solutions

Problem: "What is 17 × 23?"
  Solution 1: "17 × 23 = 17 × 20 + 17 × 3 = 340 + 51 = 391" ✓ → reinforce
  Solution 2: "17 × 23 = 17 × 25 - 17 × 2 = 425 - 34 = 391" ✓ → reinforce
  Solution 3: "17 × 23 = 300 + 91 = 391"                    ✓ → reinforce (less)
  Solution 4: "17 × 23 = 381"                               ✗ → suppress

What emerges from RL training

DeepSeek-R1 was fine-tuned with RL only — no human-written reasoning examples at all. The reward signal was solely whether the final answer was correct. Yet the model spontaneously developed:

  • Self-verification: "Let me check this... wait, that's wrong"
  • Backtracking: "Actually, I should try a different approach"
  • Structured reasoning: Breaking problems into numbered sub-steps
  • Reflection: "This seems too easy, let me double-check"

These reasoning strategies were never demonstrated to the model. They emerged purely because they lead to more correct answers. The model "discovered" that doubting itself, re-examining its work, and trying alternative approaches is useful — through optimization pressure alone.

The American Invitational Mathematics Examination is a 15-question, 3-hour competition for top high school math students. Problems cover algebra, geometry, number theory, and combinatorics. Each answer is an integer 0–999. It is widely used as a benchmark for AI mathematical reasoning.

DeepSeek-R1's performance on the AIME jumped from 15.6% (base model) to 71.0% (after RL training).

The reasoning model pipeline

Reasoning model pipeline

  1. User sends a query — the model begins generating "thinking tokens"
  2. Internal reasoning — the model works through the problem step-by-step, potentially backtracking and self-correcting (this is where the extra compute goes)
  3. Final answer — a polished response generated after reasoning is complete
  4. RL reward loop (during training only) — correct answers reinforce the reasoning patterns that produced them

What thinking tokens look like in practice

When a reasoning model processes a query, it generates two types of content:

  1. Thinking tokens — internal reasoning (may be hidden or visible depending on the provider)
  2. Output tokens — the final answer shown to the user

[THINKING]  Assume √2 = p/q, coprime. Then 2q²=p²,
            so p even. Let p=2k → q also even.
            Contradiction! ✓
[OUTPUT]    Proof: √2 is irrational by contradiction...
OpenAI o-series Claude DeepSeek-R1
Thinking visibility Hidden (summarized) Visible via API Visible (open-weight)
Control reasoning_effort budget_tokens Token count
Thinking tags Not exposed thinking block in response <think>...</think> tags

The s1 experiment: reasoning is surprisingly simple

The s1 model showed that test-time scaling can be achieved with remarkably little effort:

  1. Fine-tune Qwen2.5-32B on just 1,000 curated examples — the "s1K" dataset, a curated set of 1,000 challenging math, science, and coding problems with detailed reasoning traces
  2. At inference, apply budget forcing: when the model tries to stop reasoning, append the token "Wait" to the model's own output, forcing it to continue thinking rather than giving a final answer
  3. Result: Exceeds o1-preview (a much larger model, trained using human feedback) on AIME 2024 (the 15-question high school math competition from a few slides ago) by up to 27%!

You don't need massive RL infrastructure to get reasoning capabilities. A small amount of high-quality reasoning data + a simple inference trick can unlock substantial test-time scaling. This suggests reasoning is latent in large pretrained models — it just needs to be activated.

Claude's extended thinking

Anthropic implemented reasoning as a toggle within a single model — same weights, different inference behavior. Unlike OpenAI, Claude's thinking tokens are visible to developers and can be controlled using their API.


from anthropic import Anthropic

client = Anthropic()  # Reads ANTHROPIC_API_KEY from environment
response = client.messages.create(
    model="claude-sonnet-4-6-20250514",
    max_tokens=16000,
    thinking={"type": "enabled", "budget_tokens": 10000},
    messages=[{"role": "user", "content": "Prove that √2 is irrational."}]
)
# Response includes a visible "thinking" block + final answer
Feature OpenAI o-series Claude extended thinking
Thinking visibility Hidden Visible
Control mechanism Reasoning effort (low/med/high) Budget tokens (1K–128K) or adaptive
Latest innovation o4-mini: tools in reasoning loop Interleaved thinking: reason between tool calls

Mixture of Experts: doing more with less

In a standard (dense) transformer, every parameter activates for every token. In a Mixture of Experts (MoE) model (Shazeer et al., 2017), each layer contains $N$ expert sub-networks (FFNs), but only $k$ are activated per token:

  1. A small router network scores each expert for the current token
  2. The top-$k$ experts are selected (e.g., 2 of 8, or 8 of 256)
  3. Only selected experts compute; the rest are skipped entirely
  4. Outputs are combined as a weighted sum based on router scores

MoE architecture

Model Total params Active per token Efficiency
Mixtral 8x7B 46.7B 12.9B (2/8 experts) 3.6×
DeepSeek-V3 671B 37B (8/256 experts) 18×
Llama 4 Maverick 400B 17B (1/128 experts) 23×

Efficiency = total / active — how much more knowledge the model stores vs. what it computes per token. MoE was first proposed by Shazeer et al. (2017) and refined by Fedus et al. (2022).

The frontier landscape: early 2026

Model Developer Key capability
Claude Opus 4.6 Anthropic 1M context, adaptive thinking, visible reasoning
GPT-5 / 5.2 OpenAI Native multimodal, 100% AIME 2025 (GPT-5.2)
Gemini 3.1 Pro Google 1M context, built-in thinking, 48.4% HLE
DeepSeek-R1 DeepSeek Open-weight reasoning, 671B MoE, $5.5M training
Llama 4 Maverick Meta Open-weight, 400B MoE, 1M context
Qwen 3 Alibaba 81.5% AIME 2025 (235B base), open-weight

Nearly every frontier model now uses both reasoning capabilities and Mixture of Experts. DeepSeek-V3 trained for ~$5.6M — roughly 1/20th of GPT-4's estimated cost — proving that reasoning (RL) + efficient architecture (MoE) is a winning combination.

Benchmark saturation and the reasoning gap

Benchmark What it measures Best Human Status
MATH-500 500 competition-level math problems 99.4% Saturated
AIME 2025 15-question high school math competition 100% Saturated
GPQA Diamond PhD-level science questions 94.3% ~65% Near-saturated
SWE-bench Verified Real GitHub issue resolution 80.9% Active
ARC-AGI-2 Novel visual reasoning patterns ~54% 60% Closing
Humanity's Last Exam Expert cross-domain questions 48.4% ~90% Not saturated

Thinking dramatically helps on problems that decompose into verifiable steps (math, coding). It helps less on problems requiring novel abstractions (ARC-AGI-2) or deep domain expertise (HLE). More thinking tokens ≠ more understanding — it's more computation within the same learned representations. ARC-AGI-2 has seen rapid recent progress (~54%, up from 4.4% in late 2024), though a gap to human performance remains.

Questions?

📧 Email
💬 Discord

Agents, tools, and the agentic era: when LLMs start acting in the world