* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 17: Retrieval augmented generation

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Announcements

Wikipedia Embeddings assignment is due today, Friday, February 6 at 11:59 PM EST.

Submit via pull request after accepting assignment in GitHub Classroom.

Customer Service Chatbot — build a context-aware chatbot using retrieval and generation techniques.

Due: February 16 at 11:59 PM EST.

Learning objectives

  1. Explain why language models need external knowledge
  2. Describe the RAG pipeline: retrieve, augment, generate
  3. Implement a basic RAG system in Python using free, open-source tools
  4. Evaluate tradeoffs between RAG and fine-tuning

The knowledge problem

Parametric knowledge: facts stored inside the model's parameters during training. The model "knows" things because it memorized patterns from its training data.

Non-parametric knowledge: facts stored outside the model in documents, databases, or APIs. The model accesses this knowledge at inference time.

  1. Hallucination: the model confidently generates plausible-sounding but incorrect information
  2. Stale data: knowledge is frozen at training time — the model doesn't know about recent events
  3. No citations: the model can't tell you where it learned something, making claims hard to verify

What is RAG?

RAG is a technique that gives language models access to external knowledge by retrieving relevant documents and including them in the prompt. The model generates its response based on both its parametric knowledge and the retrieved context.

Instead of trying to store all knowledge in the model's parameters, we let the model look things up in a knowledge base at inference time. This separates what the model knows how to do (reasoning, language) from what facts it has access to (documents, data).

The RAG pipeline

Query → Embed → Retrieve → Augment → Generate
The five stages of retrieval augmented generation

$$\text{RAG}(q) = \text{LLM}(q + \text{retrieve}(q, \mathcal{D}))$$

where $q$ is the query and $\mathcal{D}$ is the document collection.

Embeddings for retrieval

You already know that embeddings map text to vectors where semantic similarity corresponds to geometric proximity. RAG exploits this: embed the query and documents into the same vector space, then find the documents closest to the query using cosine similarity.

For RAG, we use sentence or document embedding models (not word-level) that produce a single vector for an entire passage. Some free and open-source examples:

  • Sentence-BERT (all-MiniLM-L6-v2): fast, good quality, 384 dimensions
  • BGE (BAAI/bge-small-en-v1.5): state-of-the-art for retrieval

Document chunking

Real documents are long. We can't embed an entire book as a single vector (too much information is lost). Instead, we split documents into smaller chunks and embed each chunk separately.

  • Fixed-size: split every $n$ characters/tokens (simple but may cut mid-sentence)
  • Sentence-based or paragraph-based: split on sentence or paragraph boundaries (preserves meaning better; leverages author intuitions about conceptual units)
  • Semantic: use topic shifts to determine chunk boundaries (best quality but most complex)
  • Recursive: split on paragraphs first, then sentences if chunks are still too long

Too small = lost context. Too large = diluted relevance. In practice, it often works well to use 256-512 tokens per chunk with 50-100 token overlap between adjacent chunks.

Python: embed and retrieve


1from sentence_transformers import SentenceTransformer, util
2
3embedder = SentenceTransformer("all-MiniLM-L6-v2")
4
5# Knowledge base (in practice, loaded from files)
6documents = [
7    "The transformer was introduced by Vaswani et al. in 2017.",
8    "BERT uses bidirectional attention for language understanding.",
9    "GPT models use autoregressive (left-to-right) generation.",
10    "Attention mechanisms allow models to focus on relevant context.",
11    "Fine-tuning adapts a pre-trained model to a specific task.",
12]
13doc_embeddings = embedder.encode(documents, convert_to_tensor=True)
continued...

Python: embed and retrieve


14# Retrieve: embed query, find nearest documents
15query = "How do transformers work?"
16query_emb = embedder.encode(query, convert_to_tensor=True)
17scores = util.cos_sim(query_emb, doc_embeddings)[0]
18top_indices = scores.argsort(descending=True)[:3]
19
20for idx in top_indices:
21    print(f"  [{scores[idx]:.3f}] {documents[idx]}")
...continued

Python: generation with context


1from transformers import pipeline
2generator = pipeline("text2text-generation", model="google/flan-t5-base")
3
4def rag_answer(query, documents, doc_embeddings, top_k=3):
5    """Answer a question using RAG."""
6    query_emb = embedder.encode(query, convert_to_tensor=True)
7    scores = util.cos_sim(query_emb, doc_embeddings)[0]
8    top_idx = scores.argsort(descending=True)[:top_k]
9    context = "\n".join([documents[i] for i in top_idx])
10    prompt = f"""Answer based on context. Say "I don't know" if unsure.
11Context: {context}
12Question: {query}
13Answer:"""
14    return generator(prompt, max_new_tokens=128)[0]["generated_text"]
15
16print(rag_answer("What is BERT?", documents, doc_embeddings))

FLAN-T5 runs locally in Colab. For better quality, try google/flan-t5-large (requires GPU).

Vector databases

A vector database stores embedding vectors and provides fast approximate nearest neighbor (ANN) search. Instead of comparing a query to every document (slow for millions of documents), vector databases use indexing structures to find the most similar vectors in milliseconds.

Brute-force search over $n$ vectors takes $O(n)$ time. For 10 million chunks, that's 10 million cosine similarity computations per query. Vector databases use techniques like HNSW (Hierarchical Navigable Small World graphs) to reduce this to roughly $O(\log n)$.

ChromaDB works great in practice: no server setup, no API key, just pip install chromadb. For production scale, check out FAISS (Meta) and Pinecone (cloud).

End-to-end RAG with ChromaDB


1import chromadb
2from transformers import pipeline
3
4# Set up ChromaDB (handles embedding automatically)
5client = chromadb.Client()
6collection = client.create_collection("course_notes")
7
8# Add documents
9collection.add(
10    documents=[
11        "The transformer uses self-attention to process sequences in parallel.",
12        "Cross-entropy loss measures prediction error for classification.",
13        "RAG combines retrieval with generation for knowledge-grounded answers.",
14    ],
15    ids=["doc1", "doc2", "doc3"]
16)
continued...

End-to-end RAG with ChromaDB


17# Retrieve relevant documents
18results = collection.query(query_texts=["How does attention work?"], n_results=2)
19context = "\n".join(results["documents"][0])
20
21# Generate answer using retrieved context
22generator = pipeline("text2text-generation", model="google/flan-t5-base")
23prompt = f"Answer based on context.\nContext: {context}\nQuestion: How does attention work?\nAnswer:"
24print(generator(prompt, max_new_tokens=128)[0]["generated_text"])
...continued

RAG vs fine-tuning

RAG Fine-tuning
Knowledge updates Easy — just update the documents Hard — requires retraining
Cost Cheap (no training needed) Expensive (GPU time, data prep)
Citations Natural — you know which docs were retrieved Not possible
Hallucination Reduced (grounded in retrieved text) Can still hallucinate
Domain adaptation Good for factual Q&A Better for style/behavior changes
Latency Higher (retrieval + generation) Lower (just generation)
Analogy Like using a search engine Like learning a new skill

Many production systems use both: fine-tune the model for the domain's style and behavior, then use RAG for up-to-date factual knowledge.

Beyond basic RAG

  1. Re-ranking: after initial retrieval, use a cross-encoder model to re-score and re-order results for better precision (cross-encoders jointly embed pairs of queries and documents)
  2. Hybrid search: combine vector similarity (semantic) with keyword matching (e.g., BM25) for more robust retrieval
  3. Query expansion: rewrite or expand the user's query using an LLM before retrieval to improve recall
  1. Retrieval quality: if the retriever fails to find relevant documents, the generator can't produce a good answer — garbage in, garbage out
  2. Context window limits: there's a limit to how much retrieved text fits in the prompt, requiring aggressive chunking or summarization
  3. Multi-source reasoning: RAG is great for finding a specific fact, but struggles when answers require synthesizing information across many documents

Best practices

  1. Chunk wisely: experiment with chunk sizes (256-512 tokens is a good start). Use overlap between chunks.
  2. Choose the right embedding model: test multiple models on your specific domain. General-purpose models may not capture domain-specific semantics.
  3. Evaluate retrieval separately: before blaming the LLM, check if the retriever is finding the right documents.
  4. Include metadata: store source, date, and section info with chunks so the LLM can cite sources.
  5. Handle "I don't know": instruct the model to say when the context doesn't contain the answer rather than hallucinating.

Try it yourself

Explore our interactive RAG demo to see retrieval augmented generation in action:

RAG System Demo — Search 2,000 Wikipedia articles, configure chunking strategies, compare RAG vs. non-RAG answers, and visualize embedding spaces — all in your browser with no API keys required.

Your next assignment asks you to build a Customer Service Chatbot that uses retrieval and generation techniques. The RAG concepts from today's lecture are directly applicable!

Discussion: RAG in practice

  1. What kinds of applications benefit most from RAG? Where would fine-tuning be better?
  2. A company wants to build a chatbot using internal documents. What are the privacy implications of RAG vs. fine-tuning?
  3. Will RAG eventually be unnecessary if models get large enough to memorize everything?
  4. What happens when the retrieved documents themselves contain misinformation?

References

Lewis et al. (2020, NeurIPS) "Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks" — The original RAG paper.

Borgeaud et al. (2022, ICML) "Improving Language Models by Retrieving from Trillions of Tokens" — RETRO: scaling retrieval to massive corpora.

Gao et al. (2024, arXiv) "Retrieval-Augmented Generation for Large Language Models: A Survey" — Comprehensive survey of RAG techniques.

ChromaDB Documentation — Getting started with vector databases for RAG.

Questions?

📧 Email
💬 Discord

Week 6: BERT deep dive — bidirectional attention, masked language modeling, and the pre-train/fine-tune paradigm!

split: 13

split: 17

split: 16