Lecture 24: Agents and tool use

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Define what an LLM agent is and how it differs from a chatbot
  2. Explain function calling, MCP, and how LLMs interact with external tools
  3. Describe the ReAct framework and its reasoning-action loop
  4. Explain how agentic coding tools (Claude Code, Cursor) are changing software development
  5. Evaluate the safety implications of giving LLMs the ability to act in the world

📓 Companion Notebook — build a simple agent with tool dispatch

From chatbots to agents

An agent is a system where a language model can:

  1. Reason about a task (plan steps, reflect on progress)
  2. Act by calling external tools (search, code execution, APIs)
  3. Observe the results of those actions
  4. Iterate until the task is complete

A chatbot responds to prompts. An agent takes actions in the world.

When you use ChatGPT to browse the web or run Python code, you are interacting with an agent. What makes this qualitatively different from a simple question-answering system?

Why tools matter

Language models trained on text alone cannot:

  • Access current information (training data has a cutoff date)
  • Perform precise computation (arithmetic, symbolic math)
  • Interact with external systems (databases, APIs, file systems)
  • Verify their own claims (no ground truth access)

By connecting an LLM to external tools, we can combine the model's language understanding and reasoning with tools that provide accuracy, recency, and real-world interaction. The LLM decides what to do; the tools do it.

Function calling

Function calling is a mechanism where the LLM outputs a structured request to invoke an external function, rather than generating free-form text. The system executes the function, and the result is fed back to the LLM.


tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Get current weather for a location",
        "parameters": {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "City name"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
            },
            "required": ["location"]
        }
    }
}]

Function calling in practice


# User asks: "What's the weather like in Hanover, NH?"

# Step 1: LLM generates a function call (not free text)
response = {
    "function_call": {
        "name": "get_weather",
        "arguments": '{"location": "Hanover, NH", "unit": "fahrenheit"}'
    }
}

# Step 2: System executes the function
result = get_weather(location="Hanover, NH", unit="fahrenheit")
# Returns: {"temperature": 28, "condition": "snowy"}

# Step 3: Result is fed back to the LLM
# LLM generates: "It's 28°F and snowy in Hanover, NH right now."

The LLM never executes code itself -- it generates a structured request that the system dispatches. This separation of reasoning from execution is fundamental to agent safety.

The ReAct framework

ReAct interleaves two capabilities in a loop:

  • Reasoning (chain-of-thought): The model thinks about what to do next
  • Acting: The model calls a tool or takes an action
  • Observing: The model reads the result and decides the next step

This continues until the task is complete or the model decides it has enough information.

Pure reasoning (chain-of-thought alone) can hallucinate facts. Pure acting (tool use without reasoning) can call the wrong tools. ReAct combines both: the model reasons about which tool to use, acts, then reasons about the result.

ReAct in action


Question: "What is the elevation of the city where Dartmouth College
           is located?"

Thought 1: I need to find which city Dartmouth College is in.
Action 1:  search("Dartmouth College location")
Obs 1:     Dartmouth College is in Hanover, New Hampshire.

Thought 2: Now I need the elevation of Hanover, NH.
Action 2:  search("Hanover New Hampshire elevation")
Obs 2:     Hanover, NH has an elevation of 531 feet (162 m).

Thought 3: I have the answer.
Action 3:  finish("531 feet (162 meters)")

Each Thought is the model reasoning; each Action is a tool call; each Obs is the tool's response.

ReAct vs. chain-of-thought

Chain-of-thought (Lecture 22) improves reasoning but can hallucinate facts. ReAct adds grounding: the model checks its reasoning against real tool outputs.

Approach         | Reasoning | Grounded | Best for
Chain-of-thought | Yes       | No       | Math, logic
Act-only         | No        | Yes      | Simple lookups
ReAct            | Yes       | Yes      | Complex, factual tasks

On knowledge-intensive tasks (HotpotQA, FEVER), ReAct outperformed chain-of-thought by reducing hallucinations through grounded search. On interactive decision-making tasks (ALFWorld, WebShop), ReAct outperformed act-only baselines by making more informed tool choices.

Toolformer: self-taught tool use

Toolformer trains an LLM to decide when and how to call tools by embedding API calls directly into training text. The model learns to insert tool calls at positions where they reduce prediction loss.


Original:  "The Eiffel Tower is 330 meters tall."
Annotated: "The Eiffel Tower is [QA("height of Eiffel Tower") → 330]
            meters tall."

Original:  "The population of France is about 67 million."
Annotated: "The population of France is about
            [Search("France population") → 67 million]."

The model learns to insert [Tool(query) → result] at positions where the tool call improves next-token prediction. At inference time, the system intercepts these calls and executes them.
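
As a simplified illustration of that interception step, the sketch below post-processes generated text, replacing each embedded call with a tool result. (The real Toolformer system instead pauses decoding at the → token, calls the API, and inserts the response before continuing; the run_tool dispatcher and its stand-in results here are hypothetical.)

import re

# Stand-in backends for illustration only; a real system would call live APIs.
def run_tool(tool, query):
    fake_results = {"QA": "330", "Search": "67 million"}
    return fake_results.get(tool, "")

def fill_tool_calls(text):
    """Replace each [Tool("query") → ...] span with the tool's output."""
    pattern = r'\[(\w+)\("([^"]+)"\)\s*→[^\]]*\]'
    return re.sub(pattern, lambda m: run_tool(m.group(1), m.group(2)), text)

print(fill_tool_calls('The Eiffel Tower is [QA("height of Eiffel Tower") → 330] meters tall.'))
# -> The Eiffel Tower is 330 meters tall.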

Common agent tools

Tool             | Purpose                     | Example
Web search       | Access current information  | "What happened in the news today?"
Code interpreter | Execute Python, do math     | "Calculate the eigenvalues of this matrix"
File system      | Read/write files            | "Save this analysis to report.csv"
Database         | Query structured data       | "How many users signed up last month?"
API calls        | Interact with services      | "Send an email to the team"
Browser          | Navigate web pages          | "Fill out this form and submit it"
Image generation | Create visual content       | "Draw a diagram of this architecture"

Implementing a simple agent loop


import openai, json

TOOLS = {
    "search": lambda q: web_search(q),
    "calculate": lambda expr: eval(expr),  # Simplified! Unsafe on untrusted input.
    "finish": lambda answer: answer,
}

def agent_loop(question, max_steps=5):
    messages = [{"role": "user", "content": question}]
    for step in range(max_steps):
        response = openai.chat.completions.create(
            model="gpt-4", messages=messages, tools=tool_definitions
        )
        msg = response.choices[0].message
        if msg.tool_calls:
            messages.append(msg)  # Record the assistant's tool request first
            for call in msg.tool_calls:
                fn = TOOLS[call.function.name]
                result = fn(**json.loads(call.function.arguments))
                messages.append({"role": "tool", "content": str(result),
                                 "tool_call_id": call.id})
        else:
            return msg.content  # Final answer (no tool call)
    return "Max steps reached"
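
The tool_definitions variable above is the schema the model sees for each tool, in the same format as the get_weather example earlier. A minimal sketch for this agent's three tools might look like the following; the parameter names (q, expr, answer) are assumptions chosen to match the lambdas in TOOLS so that the keyword unpacking works:

tool_definitions = [
    {"type": "function", "function": {
        "name": "search",
        "description": "Search the web and return a short text snippet",
        "parameters": {"type": "object",
                       "properties": {"q": {"type": "string"}},
                       "required": ["q"]}}},
    {"type": "function", "function": {
        "name": "calculate",
        "description": "Evaluate an arithmetic expression",
        "parameters": {"type": "object",
                       "properties": {"expr": {"type": "string"}},
                       "required": ["expr"]}}},
    {"type": "function", "function": {
        "name": "finish",
        "description": "Report the final answer to the user",
        "parameters": {"type": "object",
                       "properties": {"answer": {"type": "string"}},
                       "required": ["answer"]}}},
]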

The agent loop pattern

Nearly all LLM agents follow the same core loop:

  1. Prompt the LLM with the task and available tools
  2. Parse the LLM's output for tool calls or a final answer
  3. Execute any requested tool calls
  4. Append tool results to the conversation
  5. Repeat until the LLM produces a final answer (or a step limit is reached)
Key design considerations:

  • Maximum steps: Prevents infinite loops (typically 5--20)
  • Error handling: What happens when a tool call fails?
  • Sandboxing: Code execution must be isolated for safety (see the sketch below)
  • Cost control: Each loop iteration costs API tokens
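
The sandboxing concern applies directly to the eval()-based calculate tool in the loop above. A minimal sketch of a restricted arithmetic evaluator using Python's ast module (real deployments would go further, e.g. running code in a container or subprocess with resource limits):

import ast
import operator

# Whitelisted arithmetic operations only; anything else raises an error.
_OPS = {ast.Add: operator.add, ast.Sub: operator.sub,
        ast.Mult: operator.mul, ast.Div: operator.truediv,
        ast.Pow: operator.pow, ast.USub: operator.neg}

def safe_calculate(expr):
    """Evaluate a purely arithmetic expression without calling eval()."""
    def _eval(node):
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in _OPS:
            return _OPS[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in _OPS:
            return _OPS[type(node.op)](_eval(node.operand))
        raise ValueError("Disallowed expression")
    return _eval(ast.parse(expr, mode="eval"))

# e.g. safe_calculate("76329 / 33950") -> 2.248...

Swapping this in for the eval lambda limits the tool to pure arithmetic and rejects everything else.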

Multi-step reasoning example


1User: "Compare the GDP per capita of the US and Japan in 2023,
2       and calculate the ratio."
3
4Step 1 - Thought: I need GDP per capita data for both countries.
5Step 1 - Action:  search("US GDP per capita 2023")
6Step 1 - Result:  "$76,329"
7
8Step 2 - Action:  search("Japan GDP per capita 2023")
9Step 2 - Result:  "$33,950"
10
11Step 3 - Thought: Now I can calculate the ratio.
12Step 3 - Action:  calculate("76329 / 33950")
13Step 3 - Result:  2.248
14
15Step 4 - Answer:  "US GDP per capita ($76,329) is approximately
16                   2.25x Japan's ($33,950) in 2023."

The agent decomposed the task, gathered data with search, computed the ratio with a calculator, and synthesized a final answer.

Model Context Protocol (MCP)

MCP is an open protocol that standardizes how LLMs connect to external tools and data sources. Think of it as USB for AI — any MCP-compatible tool works with any MCP-compatible model.

Before MCP                             | With MCP
Each model has its own tool format     | Universal JSON-RPC protocol
Custom integration per tool × model    | Write once, works everywhere
Tools tightly coupled to specific APIs | Tools are portable across models
Hard to share tool implementations     | Open ecosystem of shared tools

MCP servers provide tools (functions the model can call), resources (data the model can read), and prompts (templates for common tasks). Any model that speaks MCP can use any MCP server.
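
To make the protocol concrete, here is roughly what one tool invocation looks like on the wire, shown as Python dicts. The JSON-RPC method name tools/call comes from the MCP specification; the get_weather tool and its arguments are illustrative, not part of the standard:

# Client -> server: ask the MCP server to run one of its tools
request = {
    "jsonrpc": "2.0",
    "id": 7,
    "method": "tools/call",
    "params": {
        "name": "get_weather",                    # a tool this server happens to expose
        "arguments": {"location": "Hanover, NH"}  # illustrative arguments
    }
}

# Server -> client: the tool's output, returned as typed content blocks
response = {
    "jsonrpc": "2.0",
    "id": 7,
    "result": {"content": [{"type": "text", "text": "28°F and snowy"}]}
}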

Computer Use and agentic coding

Computer Use (Anthropic, 2024) gives Claude the ability to see screenshots, move the mouse, click buttons, and type — interacting with any software, not just tools with APIs.

System      | What it does                 | Benchmark
Claude Code | Terminal-based coding agent  | -
Cursor      | IDE with integrated AI agent | -
OpenHands   | Open-source software agent   | SWE-bench: 53%
Codex CLI   | OpenAI's terminal agent      | -

These tools don't just suggest code — they read files, run tests, debug errors, and iterate autonomously. Software engineering is becoming one of the first domains where agents approach human-level competence.

Agent memory

LLM context windows are finite (4K–128K tokens). Long-running agents need additional memory:

Memory type    | Implementation                  | Use case
Short-term     | Conversation history in context | Recent steps and observations
Working memory | Scratchpad / notepad tool       | Intermediate results, running totals
Long-term      | Vector database (Lecture 17)    | Past experiences, learned procedures
Episodic       | Structured logs                 | What worked/failed in previous runs

Memory management is one of the hardest problems in agent design. Too little context and the agent forgets its plan. Too much context and it becomes slow, expensive, and confused by irrelevant information. Current research focuses on hierarchical memory — agents that compress old context rather than discarding it.
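
One way to sketch that compression idea, assuming the same OpenAI-style chat API as the agent loop earlier (the summarization prompt and the max_keep threshold are illustrative choices): when the history grows too long, older turns are replaced by an LLM-written summary instead of being dropped.

import openai

def compress_history(messages, max_keep=10):
    """Keep the most recent turns verbatim; summarize everything older."""
    if len(messages) <= max_keep:
        return messages
    old, recent = messages[:-max_keep], messages[-max_keep:]
    summary = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user",
                   "content": "Summarize the key facts, decisions, and open tasks "
                              "in this conversation:\n"
                              + "\n".join(str(m) for m in old)}],
    ).choices[0].message.content
    return [{"role": "system",
             "content": f"Summary of earlier steps: {summary}"}] + recent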

Agent safety: the principal-agent problem

  • Unintended actions: An agent told to "clean up my email" might delete important messages
  • Goal misalignment: An agent optimizing a metric might find harmful shortcuts
  • Prompt injection: A malicious website could hijack an agent browsing the web
  • Cascading errors: One bad tool call can lead to a chain of incorrect actions
  • Capability overhang: Agents may have more power than their operators realize

Hagendorff (2025) tested LLM agents on safety benchmarks and found a 97.14% attack success rate — agents routinely executed harmful actions when cleverly prompted. The core challenge: how do you verify that an agent is doing what you intended when its actions are opaque and its reasoning is complex?

Mitigations: human-in-the-loop for irreversible actions, sandboxed execution, budget limits, audit logging, and minimal capability grants.
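
As a sketch of the first mitigation, the tool dispatch from the agent loop can be wrapped so that irreversible actions require explicit approval (the IRREVERSIBLE set below is a hypothetical list, not a standard):

# Hypothetical set of tools whose effects cannot be undone.
IRREVERSIBLE = {"send_email", "delete_file", "execute_payment"}

def dispatch_with_confirmation(name, arguments):
    """Run a tool from TOOLS, pausing for human approval on irreversible actions."""
    if name in IRREVERSIBLE:
        answer = input(f"Agent wants to run {name}({arguments}). Allow? [y/N] ")
        if answer.strip().lower() != "y":
            return "Action blocked by user."
    return TOOLS[name](**arguments)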

The agent spectrum

Paradigm           | Autonomy level                          | Example
Tool-using LLM     | Low — calls functions on demand         | ChatGPT with plugins
ReAct agent        | Medium — reasons + acts iteratively     | Research assistant
Coding agent       | High — writes, tests, debugs software   | Claude Code, OpenHands
Multi-agent system | Very high — LLMs collaborating/debating | Code review, delegation

Each step up the spectrum gives the LLM more autonomy — and more potential for both benefit and harm. The field is moving rapidly toward high-autonomy agents: OpenAI's "deep research" agent, Anthropic's Claude Code, and Google's Project Mariner can all operate for hours without human intervention.

Discussion

  1. The automation frontier: Coding agents can now fix real GitHub issues. What does this mean for software engineering as a career? Is this different from how compilers automated assembly language?

  2. Trust and verification: Hagendorff found 97% attack success on safety benchmarks. How do you build trust in systems that act on your behalf? Is "human-in-the-loop" scalable?

  3. MCP and the tool ecosystem: MCP standardizes tool access for any model. If tools are interchangeable and models are interchangeable, where does the value lie? Who benefits most from open standards?

  4. The principal-agent problem: When you delegate a task to an AI agent, how do you verify it did what you wanted — especially when its reasoning is opaque? How is this different from delegating to a human?

Further reading

Yao et al. (2023, ICLR) "ReAct: Synergizing Reasoning and Acting in Language Models" — The ReAct framework (+34% on ALFWorld, +10% on WebShop).

Schick et al. (2023, NeurIPS) "Toolformer: Language Models Can Teach Themselves to Use Tools" — Self-taught tool use.

Anthropic (2024) "Model Context Protocol" — Open standard for LLM-tool integration.

Jimenez et al. (2024, ICLR) "SWE-bench: Can Language Models Resolve Real-World GitHub Issues?" — The benchmark for coding agents.

Hagendorff (2025, arXiv) "AI Agent Safety" — 97.14% attack success rate on agent safety benchmarks.

Questions?

📧 Email
💬 Discord

Next lecture: Mixture of experts and efficiency: scaling models without scaling compute