PSYC 51.07: Models of Language and Communication - Week 3

Classic Embeddings: LSA & LDA

Lecture 9: The Foundations of Distributional Semantics

Winter 2026

Today's Lecture 📋

  1. 🧠 The Distributional Hypothesis
  2. 📊 Vector Space Models
  3. 📚 Latent Semantic Analysis (LSA)
  4. 🎲 Latent Dirichlet Allocation (LDA)
  5. 🎨 Modern Topic Modeling: BERTopic
  6. 🔍 Evaluation & Applications

Goal: Understand how classic methods represent meaning through co-occurrence


The Fundamental Question 🤔

How do we represent the meaning of words in a way that computers can understand?

Traditional Approach:

  • Dictionaries
  • Taxonomies (WordNet)
  • Manual feature engineering
  • Knowledge bases

Labor-intensive, hard to scale

Distributional Approach:

  • Learn from data
  • Context-based
  • Scalable
  • Unsupervised

Let the data tell us what words mean!


The Distributional Hypothesis 📖

"You shall know a word by the company it keeps"

--- J.R. Firth (1957)

Key Idea:
Words that appear in similar contexts tend to have similar meanings.

Example
  • "The cat sat on the mat"
  • "The dog sat on the mat"
  • "The kitten sat on the mat"

cat, dog, kitten are semantically related

Reference: Firth, J.R. (1957). "A synopsis of linguistic theory 1930-1955"


From Words to Vectors 🗺️

Goal: Represent each word as a point in high-dimensional space

Concrete Example

Consider a tiny corpus with 3 documents:

  • Doc 1: "The cat sat on the mat"
  • Doc 2: "The dog ran in the park"
  • Doc 3: "The cat and dog played"

Word-context co-occurrence:

Word     with "the"   with "sat"   with "ran"
cat          2            1            0
dog          2            0            1
mat          1            1            0
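
Counts like these can be computed in a few lines of Python. A minimal sketch, assuming a context window of 2 and whitespace tokenization (so exact counts may differ slightly from the illustrative table above):

from collections import defaultdict

docs = [
    "the cat sat on the mat",
    "the dog ran in the park",
    "the cat and dog played",
]

window = 2  # context window size (an assumption)
cooc = defaultdict(lambda: defaultdict(int))
for doc in docs:
    tokens = doc.split()
    for i, word in enumerate(tokens):
        lo, hi = max(0, i - window), min(len(tokens), i + window + 1)
        for j in range(lo, hi):
            if j != i:
                cooc[word][tokens[j]] += 1

print(cooc["cat"]["the"])  # how often "cat" occurs near "the"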

Problem: Real vocabularies have 10,000+ words, and contexts are even more numerous!

We need dimensionality reduction 📉


Building a Term-Document Matrix 📊

Step 1: Count how often each word appears in each document

            Doc 1   Doc 2   Doc 3   Doc 4
cat           0       4       1       0
computer      0       0       0       5
code          0       0       0       3
animal        2       3       2       0

Issues:

  • Very sparse (mostly zeros)
  • High dimensional (vocab size × doc count)
  • Doesn't capture semantic relationships

Apply TF-IDF weighting and dimensionality reduction!
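
In scikit-learn, the weighting step is a single call. A minimal sketch (the two toy documents are placeholders; note that scikit-learn uses a smoothed variant of IDF, so its values differ slightly from the hand formula below):

from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["The cat sat on the mat", "The dog ran in the park"]

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs)  # sparse (n_docs, n_terms) matrix

print(vectorizer.get_feature_names_out())
print(tfidf_matrix.toarray().round(2))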


TF-IDF Weighting ⚖️

Term Frequency - Inverse Document Frequency

Term Frequency (TF):

How often does term t appear in document d?

TF(t, d) = count(t, d) / |d|

Rewards terms that appear frequently within a document.

Inverse Document Frequency (IDF):

How rare is term t across all documents?

IDF(t) = log(N / df(t)), where N is the total number of documents and df(t) is the number of documents containing t

Penalizes common words like "the" and "and".

Combined: TF-IDF(t, d) = TF(t, d) × IDF(t)


TF-IDF: Worked Example 🔢

Corpus: 3 example documents shown below, drawn from a collection of 1,000 documents in total

Document   Text
Doc 1      "machine learning is great"
Doc 2      "deep learning models"
Doc 3      "machine translation works"

Calculate TF-IDF for "learning" in Doc 1:


Step 1: TF("learning", Doc1) = 1/4 = 0.25   (1 occurrence, 4 words)

Step 2: IDF("learning") = log(1000/500) = log(2) = 0.301
        (appears in 500 of 1000 docs)

Step 3: TF-IDF = 0.25 × 0.301 = 0.075

Compare: "the" (appears in 950 docs):


1IDF("the") = log(1000/950) = 0.022  ← Much lower! Common words penalized

Latent Semantic Analysis (LSA) 📚

The OG of semantic embeddings (1990)

Algorithm:

  1. Build term-document matrix
  2. Apply TF-IDF weighting
  3. Perform SVD: X = U Σ V^T
  4. Keep the top k dimensions
  5. Use the rows of U_k Σ_k as word embeddings

Key Innovation:

  • Discovers latent topics
  • Solves synonymy problem
  • Matrix factorization
  • Captures semantic similarity

SVD Decomposition:


X      =     U      ×     Σ      ×    V^T
(m×n)      (m×k)        (k×k)        (k×n)

words ×    words ×      topic        topics ×
docs       topics       strengths    docs

Matrix Interpretation:

  • U: word-topic associations
  • Σ: topic strengths (singular values)
  • V^T: topic-document associations
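
The decomposition and the rank-k approximation are easy to inspect with NumPy. A sketch on a random matrix (the 6×4 shape is arbitrary):

import numpy as np

rng = np.random.default_rng(0)
X = rng.random((6, 4))  # toy 6x4 term-document matrix

U, s, Vt = np.linalg.svd(X, full_matrices=False)
k = 2
X_k = U[:, :k] @ np.diag(s[:k]) @ Vt[:k, :]  # rank-k approximation

print(U.shape, s.shape, Vt.shape)        # (6, 4) (4,) (4, 4)
print(np.linalg.norm(X - X_k).round(3))  # reconstruction error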

Reference: Deerwester et al. (1990) - "Indexing by Latent Semantic Analysis"


LSA: Step-by-Step Worked Example 🔍

Mini corpus (4 words × 3 documents):


           Doc1    Doc2    Doc3
cat         2       0       1
dog         0       3       1
pet         1       2       0
animal      1       1       1

Step 1: Apply SVD

Step 2: Keep top 2 dimensions (k=2)


from sklearn.decomposition import TruncatedSVD
import numpy as np

X = np.array([[2, 0, 1], [0, 3, 1], [1, 2, 0], [1, 1, 1]])
svd = TruncatedSVD(n_components=2)
word_embeddings = svd.fit_transform(X)

print("Word vectors (2D):")
print(f"cat:    [{word_embeddings[0,0]:.2f}, {word_embeddings[0,1]:.2f}]")
print(f"dog:    [{word_embeddings[1,0]:.2f}, {word_embeddings[1,1]:.2f}]")
print(f"pet:    [{word_embeddings[2,0]:.2f}, {word_embeddings[2,1]:.2f}]")
print(f"animal: [{word_embeddings[3,0]:.2f}, {word_embeddings[3,1]:.2f}]")

Result: "pet" and "animal" end up close together in 2D space!


LSA: How It Works 🔍

Example: Discovering latent topics

Original Dimensions:

  • 50,000 terms
  • 10,000 documents
  • Matrix: 50k × 10k
  • Sparse and noisy

After LSA:

  • 100-300 latent dimensions
  • Dense representation
  • Captures semantic patterns
  • Easier to compute

Discovered Topics (example):

Topic 1: Technology

computer (0.8), software (0.7), code (0.6), algorithm (0.5)

Topic 2: Animals

dog (0.9), cat (0.8), pet (0.7), animal (0.6)

Topic 3: Sports

game (0.8), team (0.7), player (0.6), win (0.5)

Words with similar topic distributions are semantically similar!


LSA Limitations ⚠️

Computational Issues:

  • SVD is expensive: O(min(mn², m²n)) for an m × n matrix
  • Hard to update with new documents
  • Memory intensive for large corpora

Modeling Issues:

  • Linear relationships only
  • Bag-of-words (loses word order)
  • No polysemy handling
  • Negative values (hard to interpret)

Example Problem: Polysemy

Word: "bank"
  1. Financial institution
  2. River bank
  3. Banking (airplane maneuver)

LSA gives a single vector for all meanings!

But...

  • Still useful for many tasks
  • Foundation for modern methods
  • Fast for medium-sized corpora

Latent Dirichlet Allocation (LDA) 🎲

A probabilistic approach to topic modeling

What if we model documents as probabilistic mixtures of topics?

Key Assumptions:

  1. Each document is a mixture of topics
  2. Each topic is a distribution over words
  3. The mixture proportions vary per document

Advantages over LSA:

  • Probabilistic interpretation
  • Non-negative weights
  • Better for topic discovery
  • More interpretable

1Topic 1 -> Topic 2 -> Topic 3 -> sports, game -> tech, code -> animal, pet

Reference: Blei et al. (2003) - "Latent Dirichlet Allocation"


LDA: The Generative Story 📖

How LDA imagines documents are created:

Generative Process:

  1. Choose the number of topics K
  2. For each topic k = 1, ..., K:
    • Draw a word distribution φ_k ~ Dirichlet(β)
  3. For each document d:
    • Draw a topic distribution θ_d ~ Dirichlet(α)
    • For each word position n in d:
      • Choose a topic z_{d,n} ~ Multinomial(θ_d)
      • Choose a word w_{d,n} ~ Multinomial(φ_{z_{d,n}})

Key Parameters:

  • α: Document-topic density
    • Low α → few topics per doc
    • High α → many topics per doc
  • β: Topic-word density
    • Low β → focused topics
    • High β → general topics

Inference:

  • Given documents, infer topics
  • Use Gibbs sampling or variational inference
  • Iterative process
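
The effect of α is easy to see by sampling from a Dirichlet directly. A sketch (the concentration values 0.1 and 10 are arbitrary):

import numpy as np

rng = np.random.default_rng(0)

# Low alpha: sparse mixtures, most mass on one or two topics
print(rng.dirichlet([0.1] * 5).round(2))

# High alpha: near-uniform mixtures, many topics per document
print(rng.dirichlet([10.0] * 5).round(2))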

LDA: Concrete Generative Example 🎲

Imagine generating a document about "tech pets":


Step 1: Pick a topic mixture for this document
        θ_doc = [0.6 Tech, 0.3 Animals, 0.1 Sports]

Step 2: For each word, sample a topic, then sample a word:

Word 1: Sample topic → Tech (60% chance)
        Sample word from Tech → "software"

Word 2: Sample topic → Animals (30% chance)
        Sample word from Animals → "cat"

Word 3: Sample topic → Tech (60% chance)
        Sample word from Tech → "computer"

Word 4: Sample topic → Animals (30% chance)
        Sample word from Animals → "pet"

Result: "software cat computer pet"

LDA reverses this: Given documents, infer the topics!
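
The generative story above is also runnable. A minimal sketch (the topic-word distributions are toy values, not learned):

import numpy as np

rng = np.random.default_rng(42)

# Toy topic-word distributions (illustrative only)
topics = {
    "Tech":    (["software", "computer", "code"], [0.4, 0.4, 0.2]),
    "Animals": (["cat", "pet", "dog"],            [0.4, 0.3, 0.3]),
    "Sports":  (["game", "team", "win"],          [0.4, 0.3, 0.3]),
}
theta = {"Tech": 0.6, "Animals": 0.3, "Sports": 0.1}  # document's topic mixture

doc = []
for _ in range(4):
    # Sample a topic from the document mixture, then a word from that topic
    topic = rng.choice(list(theta), p=list(theta.values()))
    words, probs = topics[topic]
    doc.append(rng.choice(words, p=probs))

print(" ".join(doc))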


LDA in Practice 💻


from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Prepare documents
documents = [
    "The cat sat on the mat",
    "The dog ran in the park",
    "Machine learning is amazing",
    # ... more documents
]

# Create bag-of-words representation
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()

# Fit LDA model
lda = LatentDirichletAllocation(
    n_components=10,  # number of topics
    random_state=42,
    max_iter=50
)

doc_topics = lda.fit_transform(doc_term_matrix)

# Print top words per topic
for idx, topic in enumerate(lda.components_):
    top_words = [vocab[i] for i in topic.argsort()[-10:]]
    print(f"Topic {idx}: {', '.join(top_words)}")

LDA Example: Topic Discovery 🔍

Example Topics from News Corpus:

Topic 1: Politics (20%)

government, election, president, vote, policy, senate, congress, party

Topic 2: Sports (15%)

game, team, player, score, win, season, coach, championship

Topic 3: Technology (25%)

software, computer, data, algorithm, system, internet, code, AI

Topic 4: Finance (40%)

market, stock, price, investment, economy, trade, bank, profit

Document representation: [0.20, 0.15, 0.25, 0.40] → mainly Finance
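
Given the fitted model from the previous slides, a document's mixture comes from lda.transform. A sketch (the example sentence is ours):

# Continues the lda / vectorizer objects fitted earlier
new_doc = ["stock market prices rose on strong bank profits"]
mixture = lda.transform(vectorizer.transform(new_doc))[0]
print(mixture.round(2))  # weights over the topics, summing to 1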


Modern Topic Modeling: BERTopic 🎨

Combining neural embeddings with topic models

BERTopic Pipeline:

  1. Embed documents with BERT
  2. Reduce dimensionality with UMAP
  3. Cluster with HDBSCAN
  4. Generate topics via c-TF-IDF
  5. Fine-tune with MaximalMarginalRelevance

Advantages:

  • Leverages pre-trained models
  • Better semantic understanding
  • Automatic topic number detection
  • Dynamic topic modeling
  • Hierarchical topics

from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# Initialize
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

model = BERTopic(
    embedding_model=embedding_model,
    language="english",
    calculate_probabilities=True,
    verbose=True
)

# Fit and transform
topics, probs = model.fit_transform(documents)

# Get topic info
topic_info = model.get_topic_info()

# Visualize
fig = model.visualize_topics()

Reference: Grootendorst (2022) - "BERTopic: Neural topic modeling with a class-based TF-IDF procedure"


LSA vs. LDA vs. BERTopic 📊

                     LSA       LDA       BERTopic
Interpretability     Medium    High      High
Topic coherence      Medium    High      Very High
Computation          Fast      Medium    Slow
Scalability          Good      Good      Medium
Pre-training         No        No        Yes
Hyperparameters      Few       Many      Many
Topic number         Fixed     Fixed     Automatic

Recommendations:

  • LSA: Quick exploration, large-scale retrieval
  • LDA: Interpretable topics, when you know the topic count
  • BERTopic: State-of-the-art quality, exploratory analysis

Evaluating Topic Models 📏

Intrinsic Metrics:

  • Perplexity: Lower is better
    • Measures likelihood
    • Can be misleading!
  • Topic Coherence: Higher is better
    • Measures semantic similarity
    • Better correlation with human judgment
    • C_V, UCI, UMass variants
  • Topic Diversity:
    • Unique words across topics
    • Avoids redundant topics

Extrinsic Metrics:

  • Document classification accuracy
  • Information retrieval performance
  • Clustering quality

Human Evaluation:

  • Topic interpretability
  • Word intruder detection
  • Topic utility for task

Important: Perplexity ≠ usefulness!
Always check topic coherence and interpretability.


Coherence Evaluation: Code Example 💻


from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# Prepare corpus
texts = [doc.split() for doc in documents]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train LDA with different topic numbers
coherence_scores = []
for num_topics in [5, 10, 15, 20, 25]:
    lda = LdaModel(corpus, num_topics=num_topics,
                   id2word=dictionary, passes=10)

    # Calculate coherence (C_V is recommended)
    coherence = CoherenceModel(model=lda, texts=texts,
                               dictionary=dictionary,
                               coherence='c_v')
    coherence_scores.append((num_topics, coherence.get_coherence()))

# Results: [(5, 0.42), (10, 0.51), (15, 0.48), (20, 0.45), (25, 0.41)]
# Best: 10 topics with coherence 0.51

Rule of thumb: Higher coherence = more interpretable topics


Applications of Classic Embeddings 🚀

  1. Information Retrieval
    • Semantic search
    • Document similarity
    • Query expansion
  2. Document Organization
    • Clustering
    • Topic discovery
    • Trend analysis
  3. Text Mining
    • Opinion mining
    • Literature review
    • Knowledge discovery
  4. Preprocessing
    • Feature reduction for ML
    • Noise reduction
    • Transfer learning

Real-world examples:

  • Academic paper recommendations (LSA)
  • News article categorization (LDA)
  • Social media trend detection (BERTopic)

Application Example: Semantic Search with LSA 🔎


from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Documents already vectorized with TF-IDF
# tfidf_matrix shape: (1000 docs, 5000 words)

# Apply LSA to reduce dimensions
lsa = TruncatedSVD(n_components=100)
doc_embeddings = lsa.fit_transform(tfidf_matrix)

# User query: "machine learning algorithms"
query_tfidf = vectorizer.transform(["machine learning algorithms"])
query_embedding = lsa.transform(query_tfidf)

# Find most similar documents
similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
top_5_docs = similarities.argsort()[-5:][::-1]

print("Most relevant documents:")
for idx in top_5_docs:
    print(f"  Doc {idx}: similarity = {similarities[idx]:.3f}")

Key insight: LSA finds documents about "neural networks" and "deep learning" even though those exact words weren't in the query!


Discussion Question 💬

Can distributional semantics truly capture meaning?

Arguments For:

  • Captures semantic similarity
  • Works well in practice
  • Aligns with linguistic theory
  • Scalable to large corpora
  • Unsupervised learning

Arguments Against:

  • Text-only (no grounding)
  • Missing embodied experience
  • No common sense reasoning
  • Cultural context limited
  • Reflects biases in data

The Symbol Grounding Problem

"How can the semantic interpretation of a formal symbol system be made intrinsic to the system, rather than just parasitic on the meanings in our heads?" - Stevan Harnad (1990)

Reference: Harnad, S. (1990). "The symbol grounding problem"


Practical Tips 💡

  1. Preprocessing Matters:
    • Remove stopwords (but not always!)
    • Lemmatization vs. stemming
    • Handle punctuation carefully
    • Consider bigrams/trigrams
  2. Choosing Parameters:
    • Start with 10-50 topics for LDA
    • Use perplexity for model selection
    • Validate with coherence scores
    • Try multiple random seeds
  3. Interpreting Results:
    • Look at top 10-20 words per topic
    • Examine representative documents
    • Check for duplicate/junk topics
    • Visualize with pyLDAvis (see the sketch after this list)
  4. When to Use What:
    • LSA: Fast exploration, search
    • LDA: Interpretable topics
    • BERTopic: Best quality, latest research
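
A minimal pyLDAvis sketch for the scikit-learn LDA fitted earlier (the helper module is pyLDAvis.lda_model in recent releases and pyLDAvis.sklearn in older ones; check your installed version):

import pyLDAvis
import pyLDAvis.lda_model  # pyLDAvis.sklearn before v3.4

# lda, doc_term_matrix, vectorizer come from the LDA example above
panel = pyLDAvis.lda_model.prepare(lda, doc_term_matrix, vectorizer)
pyLDAvis.save_html(panel, "lda_topics.html")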

Summary 🎯

What we learned today:

  1. Distributional Hypothesis: Words in similar contexts have similar meanings
  2. LSA (1990):
    • SVD on term-document matrix
    • Linear dimensionality reduction
    • Fast but limited interpretability
  3. LDA (2003):
    • Probabilistic topic modeling
    • Documents as mixtures of topics
    • Highly interpretable
  4. BERTopic (2022):
    • Neural embeddings + clustering
    • State-of-the-art coherence
    • Automatic topic detection
  5. Evaluation: Coherence > perplexity
  6. Limitation: Static representations, no grounding

Next: Neural word embeddings (Word2Vec, GloVe, FastText)!


Key References 📚

Classic Papers:

  • Firth, J.R. (1957). "A synopsis of linguistic theory 1930-1955"
  • Deerwester et al. (1990). "Indexing by Latent Semantic Analysis"
  • Blei et al. (2003). "Latent Dirichlet Allocation"
  • Harnad, S. (1990). "The symbol grounding problem"

Modern Extensions:

  • Boleda, G. (2020). "Distributional Semantics and Linguistic Theory"
  • Grootendorst, M. (2022). "BERTopic: Neural topic modeling with a class-based TF-IDF procedure"
  • Grand et al. (2022). "Semantic projection recovers rich human knowledge of multiple object features"

Tools & Libraries:

  • Scikit-learn: sklearn.decomposition.TruncatedSVD, LatentDirichletAllocation
  • Gensim: gensim.models.LsiModel, LdaModel
  • BERTopic: https://maartengr.github.io/BERTopic/

Questions? 🙋

Next Lecture:

Word Embeddings: Word2Vec, GloVe, FastText

From count-based to prediction-based methods!
