* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
Models of Language and Conversation

Lecture 22: Scaling Up to GPT-3 and Beyond

The Era of Few-Shot Learning

Models of Language and Conversation

Week 7

Week 7
Models of Language and Conversation

Today's Journey

  1. GPT-2 - 10x scale-up, zero-shot multitask learning
  2. GPT-3 - 100x scale-up, few-shot learning emerges
  3. Scaling Laws - Why bigger is (predictably) better
  4. RLHF & ChatGPT - Aligning LLMs with human preferences
  5. Modern Landscape - Open vs closed models
  6. Hands-on Examples - Prompting techniques in practice
Week 7
Models of Language and Conversation

GPT-2: The Unexpected Leap

Discussion

What happens when we scale up GPT by 10x?

GPT (2018):

  • 117M parameters
  • 5B tokens training
  • BooksCorpus
  • Requires fine-tuning

GPT-2 (2019):

  • 1.5B parameters (13x larger)
  • 40GB text (8x more data)
  • WebText dataset
  • No fine-tuning needed!
Key Claim

"Language models are unsupervised multitask learners"

Reference: Radford et al. (2019) - "Language Models are Unsupervised Multitask Learners"

Week 7
Models of Language and Conversation

The WebText Dataset

How GPT-2 was trained:

WebText Creation
  1. Scrape all outbound links from Reddit with >=3 karma
  2. Filter for quality and diversity
  3. Remove Wikipedia (to avoid test set contamination)
  4. Result: 40GB of text, 8 million documents

Why Reddit links?

  • Community curation (karma = quality signal)
  • Diverse topics and writing styles
  • Web-scale variety
  • Human-filtered content
Think about it!

This introduced a new paradigm: curated web scraping as training data!

Week 7
Models of Language and Conversation

WebText: Concrete Examples

What kinds of documents were included:


Reddit post (3+ karma): "Check out this great article about climate science"
 -> Linked article scraped and included in training

Reddit post (3+ karma): "Here's an amazing tutorial on machine learning"
 -> Tutorial content included in training

Reddit post (2 karma): "Random blog post"
 -> EXCLUDED (below karma threshold)

Sample document types in WebText:

Source Type Example Why Included
News articles NYT, BBC High quality journalism
Educational Medium posts, tutorials Clear explanations
Forums Stack Overflow answers Technical knowledge
Blogs Personal essays Diverse writing styles
Week 7
Models of Language and Conversation

GPT-2 Model Sizes

Four model sizes released progressively:

Model Parameters Layers Hidden Size
Small 117M 12 768
Medium 345M 24 1024
Large 762M 36 1280
XL 1.5B 48 1600

Staged release strategy:

  • Feb 2019: Released small model (117M)
  • May 2019: Medium model (345M)
  • Aug 2019: Large model (762M)
  • Nov 2019: Full model (1.5B)
  • Concerns about misuse led to gradual release
Week 7
Models of Language and Conversation

Zero-Shot Task Transfer

The surprising finding: GPT-2 can perform tasks without fine-tuning!

Zero-Shot Prompting Examples

Translation:


English: I love machine learning
French:

GPT-2 completes: "J'aime l'apprentissage automatique"

Question Answering:


Answer the question:
Q: What is the capital of France?
A:

GPT-2 completes: "Paris"

Summarization:


[Long article text here]

TL;DR:

GPT-2 completes with a summary

Week 7
Models of Language and Conversation

Zero-Shot: How It Works

GPT-2 saw similar patterns during pre-training:


1Example from training data (hypothetical):
2
3"...The meeting was held in Berlin. The German chancellor...
4Later that day in Tokyo, Japanese officials...
5
6Quick Summary: Leaders from Germany and Japan met to discuss..."

At inference time:


1User prompt: "[Article about climate conference]
2
3TL;DR:"
4
5GPT-2 thinks: "I've seen 'TL;DR:' followed by summaries thousands
6of times in my training. I should output a summary here."

Key insight: Zero-shot works because the model learned task formats implicitly from diverse web text!

Week 7
Models of Language and Conversation

GPT-2 Performance

Zero-shot results on various benchmarks:

Task Metric Fine-tuned SOTA GPT-2 Zero-shot
Translation (En->Fr) BLEU 45.6 11.5
Summarization ROUGE 40.2 29.3
Question Answering Accuracy 89.4 63.1
Reading Comprehension F1 91.8 55.0

Promising:

  • Works without fine-tuning
  • Generalizes across tasks
  • Improves with scale

Limitations:

  • Still behind fine-tuned models
  • Inconsistent quality
  • Hard to control
Week 7
Models of Language and Conversation

Text Generation Quality

GPT-2 Generated Text Sample

Prompt: "In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains."

GPT-2 continues:

"Even more surprising to the researchers was the fact that the unicorns spoke perfect English. The scientist named the population, after their distinctive horn, Ovid's Unicorn. These four-horned, silver-white unicorns were previously unknown to science..."

Observations:

  • Coherent and fluent
  • Maintains context and style
  • Completely fabricated "facts"
  • No grounding in reality
Week 7
Models of Language and Conversation

GPT-3: The 175B Parameter Model

Discussion

What happens when we scale up another 100x?

Model size comparison:

Model Parameters Relative Size
GPT-1 117M 1x
GPT-2 1.5B 13x
GPT-3 175B 1,500x

Key insight: GPT-3 is so large that new capabilities emerge that weren't present in smaller models!

Reference: Brown et al. (2020) - "Language Models are Few-Shot Learners"

Week 7
Models of Language and Conversation

GPT-3 Model Specifications

Component Value
Layers 96
Hidden size (d_model) 12,288
Attention heads 96
Context window 2048 tokens
Training tokens 300 billion
Training data 570GB (filtered)
Training compute ~3,640 petaflop-days
Estimated training cost ~$4.6M
Scale

GPT-3 is so large it has never been fully fine-tuned - only used via API!

Week 7
Models of Language and Conversation

GPT-3 Training Data

Training corpus composition:

Dataset Tokens Weight in Training
Common Crawl (filtered) 410B 60%
WebText2 19B 22%
Books1 12B 8%
Books2 55B 8%
Wikipedia 3B 3%

Key differences from GPT-2:

  • Much larger and more diverse
  • Includes Common Crawl (with quality filtering)
  • Multiple passes over high-quality data
  • Carefully balanced mixture
Week 7
Models of Language and Conversation

Few-Shot Learning

GPT-3's key capability: In-context learning

Learning Paradigms Compared

1. Zero-shot: Task description only


1Translate to French: I love AI ->

2. One-shot: One example


1sea otter -> loutre de mer
2I love AI ->

3. Few-shot: Multiple examples (typically 10-100)


1dog -> chien
2cat -> chat
3bird -> oiseau
4I love AI ->

No gradient updates! Just prompt engineering.

Week 7
Models of Language and Conversation

Few-Shot: Worked Example

Sentiment Classification with 3 examples:


prompt = """
Classify the sentiment of each review as Positive or Negative.

Review: "This movie was amazing, I loved every minute!"
Sentiment: Positive

Review: "Terrible film, complete waste of time."
Sentiment: Negative

Review: "A masterpiece of modern cinema."
Sentiment: Positive

Review: "The acting was wooden and the plot made no sense."
Sentiment:"""

# GPT-3 output: "Negative"

Why this works:

  • Model recognizes the pattern from examples
  • Applies same pattern to new input
  • No weight updates needed!
Week 7
Models of Language and Conversation

In-Context Learning: How It Works

The mechanism behind few-shot learning:


1Prompt structure:
2[Example 1] [Example 2] [Example 3] [New Input]
3
4What GPT-3 "sees":
51. Pattern: "Review: X" followed by "Sentiment: Y"
62. Mapping: positive language -> "Positive"
73. Mapping: negative language -> "Negative"
84. Task: Apply this mapping to new input

Key observations:

  • Model recognizes pattern in examples
  • Continues the pattern for new input
  • No weight updates - pure inference!
  • "Learning" happens at inference time
Important

This is NOT the same as training! The model weights don't change.

Week 7
Models of Language and Conversation

GPT-3's Emergent Abilities

Discussion

What can GPT-3 do that smaller models can't?

Emergent capabilities:

  • Arithmetic: 2-3 digit addition/subtraction
  • Reasoning: Simple logical deduction
  • Code generation: Write simple programs
  • Knowledge synthesis: Combine facts
  • Style transfer: Mimic writing styles
  • Task composition: Multi-step procedures
Scaling Hypothesis

These abilities weren't explicitly trained - they emerged from scale!

Reference: Wei et al. (2022) - "Emergent Abilities of Large Language Models"

Week 7
Models of Language and Conversation

Emergent Abilities: Concrete Examples

Arithmetic (emerges around 10B parameters):


1Q: What is 47 + 58?
2A: 105

Code Generation:


# Write a function to check if a number is prime
def is_prime(n):
    if n < 2:
        return False
    for i in range(2, int(n**0.5) + 1):
        if n % i == 0:
            return False
    return True

Multi-step Reasoning:


1Q: If I have 3 apples and give 2 to my friend,
2   then buy 4 more, how many do I have?
3A: 3 - 2 + 4 = 5 apples
Week 7
Models of Language and Conversation

The Scaling Laws Hypothesis

Kaplan et al. (2020) - "Scaling Laws for Neural Language Models"

Model performance scales as a power law with:

  • Model size (parameters)
  • Dataset size (tokens)
  • Compute (FLOPs)

Key findings:

  1. Performance depends strongly on scale
  2. Very weak dependence on model shape (depth vs width)
  3. Smooth, predictable improvements
  4. Optimal compute allocation: Grow model and data together
Think about it!

If scaling laws hold, we can predict future model performance!

Week 7
Models of Language and Conversation

The Scaling Law Formula

Loss as a function of scale:

    L(N) = (N_c / N)^(α_N)

where:

  • L(N) = Cross-entropy loss
  • N = Number of parameters
  • N_c = Scaling constant
  • α_N ≈ 0.076 (empirically determined)

In practice, this means:

10x more parameters -> ~15% lower loss
100x more parameters -> ~30% lower loss
1000x more parameters -> ~45% lower loss
Week 7
Models of Language and Conversation

Scaling Laws: Visual Understanding

How loss decreases with scale:


Loss
  ^
  |
3.5|  *
   |    *
3.0|      *
   |        *
2.5|          *
   |            *
2.0|              *  *  *  *  * (diminishing returns)
   +------------------------------------------>
       10M   100M   1B    10B   100B   1T
                   Parameters

Key observation: Returns diminish but never stop - every 10x increase helps!

Week 7
Models of Language and Conversation

Implications of Scaling Laws

Good news:

  • Predictable improvements
  • Clear path to better models
  • Can plan compute budgets
  • Smooth progress curve

Challenges:

  • Diminishing returns
  • Exponential cost increase
  • Hardware limitations
  • Environmental impact
The Cost of Scaling

To halve the loss:

  • Need ~10,000x more compute
  • GPT-3 cost ~$4.6M to train
  • GPT-4 estimated at $100M+
Week 7
Models of Language and Conversation

Chinchilla Scaling Laws

Hoffmann et al. (2022)

Previous models were over-parameterized and under-trained!

Key insight:

  • For a given compute budget, should balance model size and data
  • Optimal ratio: ~20 tokens per parameter

Comparison:

Model Parameters Tokens Tokens/Param
GPT-3 175B 300B 1.7 (under-trained!)
Chinchilla 70B 1.4T 20 (optimal)

Result: Chinchilla (70B) outperforms GPT-3 (175B)!

This influenced Llama 2, GPT-4, and other modern models

Week 7
Models of Language and Conversation

The Path to ChatGPT

Evolution from GPT-3 to ChatGPT:

Stage Model Key Innovation
1 GPT-3 Predict next token
2 InstructGPT Follow instructions
3 ChatGPT Helpful & harmless chat

Three key innovations:

  1. Instruction tuning: Train to follow instructions
  2. RLHF: Reinforcement Learning from Human Feedback
  3. Safety guardrails: Reduce harmful outputs
Week 7
Models of Language and Conversation

Instruction Tuning

What is Instruction Tuning?

Fine-tune the model on (instruction, response) pairs to make it better at following user commands.

Training examples:


1Instruction: "Explain quantum computing to a 5-year-old"
2Response: "Imagine you have a magic coin that can be heads
3          AND tails at the same time until you look at it..."
4
5Instruction: "Write a Python function to sort a list"
6Response: "def sort_list(items):
7              return sorted(items)"
8
9Instruction: "Summarize this article in 3 sentences"
10Response: [concise 3-sentence summary]
Week 7
Models of Language and Conversation

Before vs After Instruction Tuning

Before (GPT-3):


1User: "Write a haiku about AI"
2
3GPT-3: "Write a haiku about AI
4is a common creative writing
5exercise that many people
6enjoy. Here are some tips
7for writing haikus..."

Continues describing rather than doing

After (InstructGPT):


1User: "Write a haiku about AI"
2
3InstructGPT: "Silicon neurons
4Learning patterns in the void
5Dreams in binary"

Actually writes the haiku!

Week 7
Models of Language and Conversation

RLHF: Reinforcement Learning from Human Feedback

The 3-step RLHF process:


Step 1: Supervised Fine-tuning (SFT)
- Train on human-written examples of good responses
- Model learns basic instruction-following

Step 2: Reward Model Training
- Generate multiple responses to same prompt
- Humans rank responses from best to worst
- Train a model to predict human preferences

Step 3: RL Optimization (PPO)
- Generate responses with policy model
- Score with reward model
- Update policy to maximize reward
Week 7
Models of Language and Conversation

RLHF: Worked Example

Training the reward model:


Prompt: "How do I make a cake?"

Response A (Rating: 4/5):
"Here's a simple recipe: Preheat oven to 350F.
Mix 2 cups flour, 1.5 cups sugar..."

Response B (Rating: 2/5):
"Cake is a type of dessert that originated in
ancient civilizations..."

Response C (Rating: 1/5):
"I cannot help with that request."

Reward model learns:
- Helpful, direct answers get high scores
- Off-topic or unhelpful responses get low scores
Week 7
Models of Language and Conversation

ChatGPT's Impact

Launched: November 30, 2022

Growth:

  • 1 million users in 5 days
  • 100 million users in 2 months
  • Fastest-growing consumer application ever

Why so successful?

  • Easy to use (conversational interface)
  • Broadly capable (many tasks)
  • Accessible (free tier)
  • Impressive demos went viral
  • Timing (post-pandemic digital adoption)
Cultural Impact

ChatGPT brought LLMs into mainstream consciousness and sparked an AI revolution.

Week 7
Models of Language and Conversation

The Modern LLM Landscape

Post-GPT-3 developments (2020-2024):

Year Milestone
2021 Anthropic founded (Claude)
2022 ChatGPT launched
2023 GPT-4 (multimodal, improved reasoning)
2023 Llama 2 (open weights, 70B params)
2023 Gemini (Google's multimodal LLM)
2024 Claude 3, GPT-4o, Llama 3
2024 Smaller efficient models (Phi, Mistral)

Key trends:

  1. Multimodal capabilities (vision, audio)
  2. Longer context windows (100K+ tokens)
  3. Better reasoning and factuality
  4. Open-source alternatives
  5. Efficiency improvements
Week 7
Models of Language and Conversation

Open Source LLMs

Discussion

Should powerful AI models be open or closed?

Closed (GPT-4, Claude):

  Pros:
  • Better safety control
  • Monetization easier
  • Protect IP

  Cons:
  • No transparency
  • Vendor lock-in
  • Limited customization

Open (Llama, Mistral):

  Pros:
  • Transparency
  • Community innovation
  • Full control
  • No API costs

  Cons:
  • Potential misuse
  • Compute requirements
Week 7
Models of Language and Conversation

Practical Prompting Techniques

Effective prompting strategies for modern LLMs:

1. Zero-shot with clear instructions:


Classify the following text as spam or not spam.
Only respond with "spam" or "not spam".

Text: "Congratulations! You've won $1,000,000!"

2. Few-shot with examples:


Text: "Meeting at 3pm tomorrow" -> not spam
Text: "URGENT: Send money now!" -> spam
Text: "Can you review this document?" -> not spam
Text: "You've been selected for a prize!" ->
Week 7
Models of Language and Conversation

Chain-of-Thought Prompting

For complex reasoning tasks:


Q: Roger has 5 tennis balls. He buys 2 more cans of
   tennis balls. Each can has 3 balls. How many
   tennis balls does he have now?

Let's think step by step:
1. Roger starts with 5 tennis balls
2. He buys 2 cans of tennis balls
3. Each can has 3 balls, so 2 cans = 2 * 3 = 6 balls
4. Total = 5 + 6 = 11 tennis balls

A: 11 tennis balls
Key insight

Adding "Let's think step by step" dramatically improves reasoning accuracy!

Week 7
Models of Language and Conversation

Current Limitations

What LLMs still struggle with:

  1. Factual accuracy

    • Hallucinations and confabulation
    • No citations or sources
  2. Reasoning

    • Multi-step logic
    • Mathematical proofs
  3. Knowledge grounding

    • Knowledge cutoff date
    • Can't access real-time info
  4. Personalization

    • No persistent memory
    • Stateless conversations
  5. Reliability

    • Inconsistent outputs
    • Prompt sensitivity
Week 7
Models of Language and Conversation

Key Takeaways

  1. Scaling works

    • GPT -> GPT-2 -> GPT-3 showed clear improvements
    • Power law scaling continues to hold
  2. Few-shot learning emerged at scale

    • No fine-tuning needed for many tasks
    • In-context learning is powerful
  3. Scaling laws provide predictability

    • But diminishing returns and compute costs are real
    • Chinchilla scaling: balance model size and data
  4. RLHF changed everything

    • ChatGPT = GPT-3.5 + instruction tuning + RLHF
    • Alignment is crucial for deployment
  5. The field is rapidly evolving

    • Open vs closed debate continues
    • New capabilities emerging
Week 7
Models of Language and Conversation

Readings

Required Readings
  1. Radford et al. (2019) - "Language Models are Unsupervised Multitask Learners" (GPT-2) [PDF]
  2. Brown et al. (2020) - "Language Models are Few-Shot Learners" (GPT-3) [ArXiv]
  3. Kaplan et al. (2020) - "Scaling Laws for Neural Language Models" [ArXiv]
Recommended Readings
  • Wei et al. (2022) - "Emergent Abilities of Large Language Models" [ArXiv]
  • Ouyang et al. (2022) - "Training language models to follow instructions" (InstructGPT) [ArXiv]
  • Hoffmann et al. (2022) - "Training Compute-Optimal Large Language Models" (Chinchilla) [ArXiv]
Week 7
Models of Language and Conversation

Next Lecture Preview

Lecture 23: Implementing GPT from Scratch
  • Building a mini-GPT in PyTorch
  • Tokenization with BPE
  • Training loop and optimization
  • Sampling strategies (greedy, top-k, nucleus)
  • Hands-on coding session

Questions?

Week 7