 * 2. Include this script:
 * 3. Create charts with minimal configuration - colors are auto-applied!
 */
(function() {
  'use strict';

  // ==========================================================================
  // READ COLORS FROM CSS CUSTOM PROPERTIES
  // This ensures chart colors stay in sync with the theme
  // ==========================================================================

  /**
   * Get a CSS custom property value from :root.
   * @param {string} name - Custom property name, e.g. '--chart-color-1'.
   * @param {string} [fallback=''] - Returned when the property is empty or
   *   when getComputedStyle is unavailable (non-browser environment).
   * @returns {string} Trimmed property value, or the fallback.
   */
  function getCSSVar(name, fallback = '') {
    if (typeof getComputedStyle === 'undefined') return fallback;
    const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim();
    return value || fallback;
  }

  /**
   * Build palette from CSS custom properties (with fallbacks).
   * Every entry falls back to a hard-coded brand default so charts render
   * sensibly even when the theme stylesheet is missing.
   * @returns {object} Palette: brand colors, chart color cycles, semantic
   *   colors, grid/axis colors, and the chart font family.
   */
  function buildPaletteFromCSS() {
    return {
      // Primary brand colors
      dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'),
      textPrimary: getCSSVar('--text-primary', '#0a2518'),
      textSecondary: getCSSVar('--text-secondary', '#0a3d23'),

      // Chart colors (from CSS --chart-color-N variables)
      chartColors: [
        getCSSVar('--chart-color-1', '#00693e'),
        getCSSVar('--chart-color-2', '#267aba'),
        getCSSVar('--chart-color-3', '#ffa00f'),
        getCSSVar('--chart-color-4', '#9d162e'),
        getCSSVar('--chart-color-5', '#8a6996'),
        getCSSVar('--chart-color-6', '#a5d75f'),
        getCSSVar('--chart-color-7', '#003c73'),
        getCSSVar('--chart-color-8', '#d94415'),
        getCSSVar('--chart-color-9', '#643c20'),
        getCSSVar('--chart-color-10', '#c4dd88'),
        getCSSVar('--chart-color-11', '#f5dc69'),
        getCSSVar('--chart-color-12', '#424141'),
      ],

      // Background colors (semi-transparent versions)
      chartBgColors: [
        getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'),
        getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'),
        getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'),
        getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'),
        getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'),
        getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'),
      ],

      // Semantic colors
      positive: getCSSVar('--chart-positive', '#00693e'),
      negative: getCSSVar('--chart-negative', '#9d162e'),
      neutral: getCSSVar('--chart-neutral', '#424141'),
      highlight: getCSSVar('--chart-highlight', '#ffa00f'),

      // Grid and axis colors
      gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'),
      gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'),
      gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'),
      axisColor: getCSSVar('--chart-axis-color', '#0a2518'),

      // Font
      fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"),
    };
  }

  // Initialize palette (will be populated when DOM is ready)
  let CDL_PALETTE = null;
  // For convenience, expose primary chart colors array
  let CHART_COLORS = null;

  // ==========================================================================
  // FONT CONFIGURATION
  // Responsive font sizes based on typical Marp slide dimensions (1280x720)
  // ==========================================================================

  const FONT_CONFIG = {
    sizes: {
      title: 22,      // Chart title
      subtitle: 18,   // Subtitle
      legend: 16,     // Legend labels
      axisTitle: 18,  // Axis titles
      axisTicks: 16,  // Axis tick labels
      tooltip: 14,    // Tooltip text
      dataLabels: 14, // Data labels on charts
    },
    weight: {
      normal: 400,
      medium: 500,
      bold: 600,
    },
  };

  // ==========================================================================
  // HELPER FUNCTIONS
  // ==========================================================================

  /**
   * Ensure palette is initialized.
   * Lazily reads the CSS custom properties on first use and caches the result
   * in CDL_PALETTE / CHART_COLORS.
   * @returns {object} The cached palette.
   */
  function ensurePalette() {
    if (!CDL_PALETTE) {
      CDL_PALETTE = buildPaletteFromCSS();
      CHART_COLORS = CDL_PALETTE.chartColors;
    }
    return CDL_PALETTE;
  }

  /**
   * Get color for a dataset at given index.
   * Cycles through palette if more datasets than colors.
   * @param {number} index - Dataset index (any non-negative integer).
   * @returns {string} A color string from the palette cycle.
   */
  function getColor(index) {
    ensurePalette();
    return CHART_COLORS[index % CHART_COLORS.length];
  }

  /**
   * Get color with alpha transparency.
   * @param {string} color - '#rrggbb' hex or 'rgba(...)' string; anything else
   *   is returned unchanged.
   * @param {number} alpha - Alpha value to apply (0-1).
   * @returns {string} 'rgba(...)' string with the requested alpha.
   */
  function getColorWithAlpha(color, alpha) {
    // Handle hex colors
    if (color.startsWith('#')) {
      const r = parseInt(color.slice(1, 3), 16);
      const g = parseInt(color.slice(3, 5), 16);
      const b = parseInt(color.slice(5, 7), 16);
      return `rgba(${r}, ${g}, ${b}, ${alpha})`;
    }
    // Handle rgba colors
    // NOTE(review): this regex swaps the trailing number (the alpha) only;
    // assumes the input already has an alpha component, e.g. 'rgba(r, g, b, a)'
    // — a plain 'rgb(...)' string falls through unchanged. Confirm intended.
    if (color.startsWith('rgba')) {
      return color.replace(/[\d.]+\)$/, `${alpha})`);
    }
    return color;
  }

  /**
   * Generate colors for all datasets in chart data.
   * Automatically assigns colors if not specified; mutates the datasets
   * in place (only filling properties the caller left unset).
   * @param {object} data - Chart.js data object ({ labels, datasets }).
   * @param {string} chartType - Chart.js chart type ('bar', 'line', ...).
   * @returns {object} The same data object, with colors filled in.
   */
  function autoAssignColors(data, chartType) {
    if (!data || !data.datasets) return data;

    data.datasets.forEach((dataset, index) => {
      const baseColor = getColor(index);

      // Only assign colors if not already specified
      switch (chartType) {
        case 'bar':
        case 'horizontalBar':
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = baseColor;
          }
          if (!dataset.borderColor) {
            dataset.borderColor = baseColor;
          }
          if (dataset.borderWidth === undefined) {
            dataset.borderWidth = 2;
          }
          break;

        case 'line':
          if (!dataset.borderColor) {
            dataset.borderColor = baseColor;
          }
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1);
          }
          if (dataset.borderWidth === undefined) {
            dataset.borderWidth = 3;
          }
          if (dataset.pointRadius === undefined) {
            dataset.pointRadius = 6;
          }
          if (!dataset.pointBackgroundColor) {
            dataset.pointBackgroundColor = baseColor;
          }
          if (dataset.tension === undefined) {
            dataset.tension = 0.3;
          }
          break;

        case 'scatter':
        case 'bubble':
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = baseColor;
          }
          if (!dataset.borderColor) {
            dataset.borderColor = baseColor;
          }
          if (dataset.pointRadius === undefined) {
            dataset.pointRadius = 15;
          }
          if (dataset.pointHoverRadius === undefined) {
            dataset.pointHoverRadius = 18;
          }
          break;

        case 'pie':
        case 'doughnut':
        case 'polarArea':
          // For pie charts, we need multiple colors for one dataset
          if (!dataset.backgroundColor) {
            const numItems = dataset.data ? dataset.data.length : 6;
            dataset.backgroundColor = [];
            for (let i = 0; i < numItems; i++) {
              dataset.backgroundColor.push(getColor(i));
            }
          }
          if (!dataset.borderColor) {
            dataset.borderColor = '#d8d8d8'; // Slide background
          }
          if (dataset.borderWidth === undefined) {
            dataset.borderWidth = 2;
          }
          break;

        case 'radar':
          if (!dataset.borderColor) {
            dataset.borderColor = baseColor;
          }
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2);
          }
          if (dataset.borderWidth === undefined) {
            dataset.borderWidth = 2;
          }
          if (dataset.pointRadius === undefined) {
            dataset.pointRadius = 4;
          }
          if (!dataset.pointBackgroundColor) {
            dataset.pointBackgroundColor = baseColor;
          }
          break;

        default:
          // Generic color assignment
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = baseColor;
          }
          if (!dataset.borderColor) {
            dataset.borderColor = baseColor;
          }
      }
    });

    return data;
  }

  // ==========================================================================
  // CHART.JS GLOBAL DEFAULTS
  // ==========================================================================

  /**
   * Apply theme fonts, colors and scale styling to Chart.defaults globally.
   * @returns {boolean} true if defaults were applied; false when Chart.js is
   *   not loaded yet.
   */
  function applyGlobalDefaults() {
    if (typeof Chart === 'undefined') {
      console.warn('Chart.js not loaded. chart-defaults.js requires Chart.js to be loaded first.');
      return false;
    }

    // Ensure palette is loaded from CSS
    const palette = ensurePalette();

    // Font defaults
    Chart.defaults.font.family = palette.fontFamily;
    Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks;
    Chart.defaults.color = palette.textPrimary;

    // Responsive defaults
    Chart.defaults.responsive = true;
    Chart.defaults.maintainAspectRatio = false;

    // Animation (subtle)
    Chart.defaults.animation.duration = 400;

    // Plugin defaults
    // Legend
    Chart.defaults.plugins.legend.labels.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.legend,
      weight: FONT_CONFIG.weight.normal,
    };
    Chart.defaults.plugins.legend.labels.color = palette.textPrimary;
    Chart.defaults.plugins.legend.labels.usePointStyle = true;
    Chart.defaults.plugins.legend.labels.padding = 20;

    // Title
    Chart.defaults.plugins.title.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.title,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.title.color = palette.textPrimary;

    // Tooltip
    Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary;
    Chart.defaults.plugins.tooltip.titleFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.tooltip.bodyFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
    };
    Chart.defaults.plugins.tooltip.cornerRadius = 4;
    Chart.defaults.plugins.tooltip.padding = 10;

    // Scale defaults (for cartesian charts)
    // These need to be applied per-scale type
    const scaleDefaults = {
      grid: {
        color: palette.gridLight,
        lineWidth: 1,
      },
      border: {
        color: palette.gridDark,
        width: 1,
      },
      ticks: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        },
        color: palette.textPrimary,
      },
      title: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTitle,
          weight: FONT_CONFIG.weight.normal,
        },
        color: palette.textPrimary,
      },
    };

    // Apply scale defaults to linear scale
    if (Chart.defaults.scales && Chart.defaults.scales.linear) {
      if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid);
      if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border);
      if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks);
      if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title);
    }

    // Apply scale defaults to category scale
    if (Chart.defaults.scales && Chart.defaults.scales.category) {
      if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid);
      if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border);
      if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks);
      if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title);
    }

    // Apply scale defaults to logarithmic scale
    if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) {
      if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid);
      if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border);
      if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks);
      if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title);
    }

    // Apply scale defaults to radial scale (for radar charts)
    if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) {
      if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight;
      if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = palette.gridMedium;
      if (Chart.defaults.scales.radialLinear.pointLabels) {
        Chart.defaults.scales.radialLinear.pointLabels.font = {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        };
        Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary;
      }
    }

    return true;
  }

  // ==========================================================================
  // CHART WRAPPER FOR AUTO-STYLING
  // ==========================================================================

  /**
   * Wrap the Chart constructor to automatically apply CDL styling.
   * The wrapper mutates the caller's config (data colors + type defaults)
   * before delegating to the original constructor.
   */
  function wrapChartConstructor() {
    if (typeof Chart === 'undefined') return;

    const OriginalChart = Chart;

    // Create a wrapper that auto-applies colors
    window.Chart = function(ctx, config) {
      // Auto-assign colors if not specified
      if (config && config.data) {
        config.data = autoAssignColors(config.data, config.type);
      }

      // Merge default options for specific chart types
      if (config && config.options) {
        config.options = applyChartTypeDefaults(config.type, config.options);
      }

      // Call original constructor
      return new OriginalChart(ctx, config);
    };

    // Copy static properties and methods
    // NOTE(review): Object.assign copies only own enumerable statics; the
    // setPrototypeOf call below is what makes non-enumerable class statics
    // (e.g. Chart.register) reachable through the wrapper — confirm intended.
    Object.setPrototypeOf(window.Chart, OriginalChart);
    Object.assign(window.Chart, OriginalChart);

    // Preserve the prototype chain
    window.Chart.prototype = OriginalChart.prototype;
  }

  /**
   * Apply chart-type specific defaults.
   * Shallow-copies userOptions and fills in per-type defaults only where the
   * caller did not specify them.
   * @param {string} chartType - Chart.js chart type.
   * @param {object} userOptions - Caller-supplied options object.
   * @returns {object} Options with type-specific defaults merged in.
   */
  function applyChartTypeDefaults(chartType, userOptions) {
    const options = { ...userOptions };

    switch (chartType) {
      case 'bar':
      case 'horizontalBar':
        // Bar chart defaults
        if (!options.scales) options.scales = {};
        if (!options.scales.x) options.scales.x = {};
        if (!options.scales.y) options.scales.y = {};
        // Hide x-axis grid for cleaner look
        if (options.scales.x.grid === undefined) {
          options.scales.x.grid = { display: false };
        }
        break;

      case 'line':
        // Line chart defaults
        if (!options.interaction) {
          options.interaction = { intersect: false, mode: 'index' };
        }
        break;

      case 'pie':
      case 'doughnut':
        // Pie/doughnut defaults
        if (!options.plugins) options.plugins = {};
        if (options.plugins.legend === undefined) {
          const palette = ensurePalette();
          options.plugins.legend = {
            position: 'right',
            labels: {
              font: {
                family: palette.fontFamily,
                size: FONT_CONFIG.sizes.legend,
              },
              color: palette.textPrimary,
              padding: 15,
            },
          };
        }
        break;

      case 'radar':
        // Radar chart defaults - keep as-is, scale defaults applied globally
        break;

      case 'scatter':
      case 'bubble':
        // Scatter/bubble defaults
        if (!options.scales) options.scales = {};
        if (!options.scales.x) options.scales.x = {};
        if (!options.scales.y) options.scales.y = {};
        break;
    }

    return options;
  }

  // ==========================================================================
  // CONVENIENCE FUNCTIONS FOR USERS
  // Exposed on window.CDLChart for easy access
  // ==========================================================================

  window.CDLChart = {
    // Color palette access (getters to ensure lazy initialization)
    get colors() { return ensurePalette().chartColors; },
    get palette() { return ensurePalette(); },

    // Get specific color by index
    getColor: getColor,

    // Get color with transparency
    getColorWithAlpha: getColorWithAlpha,

    // Get array of colors for a specific count
    getColors: function(count) {
      ensurePalette();
      const result = [];
      for (let i = 0; i < count; i++) {
        result.push(getColor(i));
      }
      return result;
    },

    // Font configuration
    fonts: FONT_CONFIG,

    // Quick chart creation helpers
    // These create minimal config that auto-applies all styling

    /**
     * Create a simple bar chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    bar: function(canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'bar',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: {
          plugins: { legend: { display: false } },
          ...options,
        },
      });
    },

    /**
     * Create a simple line chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    line: function(canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'line',
        data: {
          labels: labels,
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
            fill: ds.fill !== undefined ? ds.fill : true,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a simple pie chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    pie: function(canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'pie',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: options,
      });
    },

    /**
     * Create a simple scatter chart
     * @param {string} canvasId - Canvas element ID
     * @param {Array} datasets - Array of {label, data: [{x, y}]} objects
     * @param {object} options - Optional overrides
     */
    scatter: function(canvasId, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'scatter',
        data: {
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a doughnut chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    doughnut: function(canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'doughnut',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: options,
      });
    },

    /**
     * Create a radar chart
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    radar: function(canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'radar',
        data: {
          labels: labels,
          datasets: datasets.map(ds => ({
            label: ds.label,
            data: ds.data,
          })),
        },
        options: options,
      });
    },
  };

  // ==========================================================================
  // INITIALIZATION
  // ==========================================================================

  /**
   * Apply defaults now if Chart.js is present; otherwise poll for it
   * (every 100 ms, up to 50 tries) and apply once it appears.
   * @returns {boolean} true when applied synchronously, false when deferred.
   */
  function initialize() {
    // Wait for Chart.js to be available
    if (typeof Chart !== 'undefined') {
      applyGlobalDefaults();
      wrapChartConstructor();
      console.log('CDL Chart defaults applied successfully.');
      return true;
    } else {
      // Chart.js not yet loaded - wait and retry
      let retries = 0;
      const maxRetries = 50; // 5 seconds max wait
      const checkInterval = setInterval(function() {
        retries++;
        if (typeof Chart !== 'undefined') {
          clearInterval(checkInterval);
          applyGlobalDefaults();
          wrapChartConstructor();
          console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).');
        } else if (retries >= maxRetries) {
          clearInterval(checkInterval);
          console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.');
        }
      }, 100);
      return false;
    }
  }

  // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts
  // Chart.js CDN should be loaded before this script
  initialize();
})();
PSYC 51.07: Models of Language and Communication - Week 4

Contextual Embeddings: ELMo, USE, BERT

Lecture 12: Beyond Static Word Representations

PSYC 51.07: Models of Language and Communication - Week 4

Winter 2026

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Today's Lecture 📋

  1. 🔄 From Static to Contextual
  2. 🧬 Language Models as Feature Extractors
  3. 🔮 ELMo: Embeddings from Language Models
  4. 🌐 Universal Sentence Encoder
  5. 🎭 BERT: Bidirectional Transformers
  6. 📊 Comparison & Applications

Goal: Understand how context transforms word representation

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

The Polysemy Problem Revisited 🤔

Recall: Static embeddings assign ONE vector per word

Example: "bank"

  1. "I deposited money at the bank"
    (financial institution)
  2. "We sat by the river bank"
    (riverside)
  3. "The plane will bank left"
    (tilt/turn)

Word2Vec/GloVe: All three get the SAME vector!

Contextual embeddings: Each gets a DIFFERENT vector based on context

Cosine Similarity Demo:


1# Static embeddings (Word2Vec)
2sim(bank_sent1, bank_sent2) = 1.0  # Same!
3
4# Contextual embeddings (BERT)
5sim(bank_sent1, bank_sent2) = 0.45
6sim(bank_sent1, bank_sent3) = 0.32
7sim(bank_sent2, bank_sent3) = 0.28
8
9# "bank" (financial) is closer to
10# "bank" (river) than to "bank" (tilt)
11# because nouns are more similar!
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Static vs. Contextual Embeddings 🔄

Static (Word2Vec, GloVe, FastText):


1# Same vector every time
2model = Word2Vec(...)
3vec1 = model['bank']
4vec2 = model['bank']
5
6assert vec1 == vec2  # True!

Characteristics:

  • One vector per word type
  • Context-independent
  • Fast lookup (dictionary)
  • Fixed after training
  • Polysemy conflation

Contextual (ELMo, BERT):


1# Different vector per occurrence
2model = BertModel.from_pretrained('bert-base')
3
4sent1 = "river bank"
5sent2 = "money bank"
6
7vec1 = get_embedding(model, sent1, 'bank')
8vec2 = get_embedding(model, sent2, 'bank')
9
10assert vec1 != vec2  # True!

Characteristics:

  • Different vector per occurrence
  • Context-dependent
  • Requires forward pass
  • Dynamic representations
  • Handles polysemy naturally
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Language Models as Feature Extractors 🧬

Key Insight: Train a language model, use its internal states as embeddings

Language Modeling Task:

Predict the next word given previous words:

Worked Example

Input: "The cat sat on the ___"

Model predicts probabilities:
"mat" 0.25
"floor" 0.18
"couch" 0.12
"dog" 0.001

To predict well, the model learns: "sat on the" suggests a surface!

Why This Works:

  • To predict "mat", model must encode that "sat on" precedes surfaces
  • Hidden states capture this contextual understanding
  • We extract these rich hidden states as embeddings!
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

ELMo: Embeddings from Language Models 🔮

The first widely-adopted contextual embedding (2018)

Key Ideas:

  1. Train deep bidirectional language model
  2. Use all layer activations
  3. Weighted combination per task
  4. Pre-train on large corpus

Architecture:

  • 2-layer biLSTM
  • Forward LM: $p(t_k \mid t_1, \ldots, t_{k-1})$
  • Backward LM: $p(t_k \mid t_{k+1}, \ldots, t_N)$
  • Character-based input (handles OOV!)

Bidirectional Processing:


1Forward:  The → cat → sat → ...
2Backward: ... ← sat ← cat ← The

Each word gets info from BOTH directions!

Layer Weighting Example:

For sentiment task, ELMo might learn:

  • Layer 0 (characters): weight = 0.1
  • Layer 1 (syntax): weight = 0.3
  • Layer 2 (semantics): weight = 0.6

Higher layers matter more for meaning!

Reference: Peters et al. (2018). "Deep contextualized word representations"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

ELMo: How It Works 🔍

Training:

  1. Pre-train on large corpus (1B Word Benchmark)
  2. Each position gets representation from all layers

Usage (downstream tasks):

  1. Freeze ELMo weights
  2. For each token, extract representations from all layers
  3. Learn task-specific weighted combination
  4. Concatenate with task model
Concrete Example: Sentiment Analysis

Input: "The movie was absolutely terrible"

Token Char Emb Layer 1 Layer 2 Weighted Sum
terrible [0.1, ...] [0.3, ...] [-0.8, ...] [-0.5, ...]

The final representation captures that "terrible" is strongly negative in this context!

Impact: Improved state-of-the-art on 6 NLP tasks!

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

ELMo in Practice 💻


1from allennlp.modules.elmo import Elmo, batch_to_ids
2
3# Initialize ELMo
4options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
5weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"
6
7elmo = Elmo(options_file, weight_file, 2, dropout=0)
8
9# Prepare sentences
10sentences = [
11    ['I', 'deposited', 'money', 'at', 'the', 'bank'],
12    ['We', 'sat', 'by', 'the', 'river', 'bank']
13]
14
15# Convert to character ids
16character_ids = batch_to_ids(sentences)
17
18# Get embeddings
19embeddings = elmo(character_ids)
20
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

ELMo in Practice 💻


21# embeddings['elmo_representations'] contains:
22# - List of 2 tensors (one per layer)
23# - Shape: [batch_size, seq_len, 1024]
24
25# Different vectors for "bank"!
26bank1 = embeddings['elmo_representations'][0][0, 5, :]  # first sentence
27bank2 = embeddings['elmo_representations'][0][1, 5, :]  # second sentence
28
29# Cosine similarity will be lower than for static embeddings
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Universal Sentence Encoder (USE) 🌐

Sentence-level embeddings for semantic similarity

Motivation:

  • Word embeddings: good for words
  • But what about sentences?
  • Average of word vectors? Too simple!
  • Need compositionality

Two Variants:

1. Transformer-based:

  • Higher accuracy
  • Slower (compute intensive)
  • Better for quality

2. Deep Averaging Network (DAN):

  • Lower accuracy
  • Much faster
  • Better for scale

Training Objectives:

  1. Unsupervised: Skip-thought
  2. Supervised: SNLI (entailment)
  3. Multi-task learning

Output:

  • 512-dimensional vector
  • Fixed-length (any sentence length)
  • Optimized for similarity tasks

Use Cases:

  • Semantic search
  • Question answering
  • Text classification
  • Clustering
  • Duplicate detection

Reference: Cer et al. (2018). "Universal Sentence Encoder"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Universal Sentence Encoder in Practice 💻


1import tensorflow_hub as hub
2import numpy as np
3
4# Load model
5embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
6
7# Example sentences
8sentences = [
9    "The cat sat on the mat.",
10    "A feline rested on the rug.",
11    "The dog ran in the park.",
12    "I love machine learning."
13]
14
15# Generate embeddings
16embeddings = embed(sentences)
17
18# Shape: [4, 512]
19print(embeddings.shape)
20
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Universal Sentence Encoder in Practice 💻


21# Compute similarity
22from sklearn.metrics.pairwise import cosine_similarity
23
24sim_matrix = cosine_similarity(embeddings)
25print(sim_matrix)
26
27# Sentences 1 and 2 should be very similar (paraphrases)
28# Sentence 3 somewhat similar (animals)
29# Sentence 4 dissimilar
30
31# Use for semantic search
32query = "cat on mat"
33query_embedding = embed([query])
34similarities = cosine_similarity(query_embedding, embeddings)[0]
35most_similar_idx = np.argmax(similarities)
36print(f"Most similar: {sentences[most_similar_idx]}")
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

BERT: Bidirectional Encoder Representations 🎭

The model that changed everything (2018)

Key Innovations:

  1. Bidirectional context (not just left-to-right)

  2. Transformer architecture (attention)

  3. Masked language model pre-training

  4. Deeply bidirectional

Impact:

  • SOTA on 11 NLP tasks
  • Sparked the "BERT-era"
  • 1000+ variants (RoBERTa, ALBERT, DistilBERT, ...)
  • Foundation for modern LLMs

Architecture Sizes:

Model Layers Hidden size
BERT-Base 12 768
BERT-Large 24 1024

Training Data:

  • BooksCorpus (800M words)
  • English Wikipedia (2.5B words)
  • Total: 3.3B words

Training Time:

  • 4 days on 64 TPU chips
  • Or 4 weeks on 8 GPUs

Reference: Devlin et al. (2018). "BERT: Pre-training of Deep Bidirectional Transformers"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Masked Language Modeling (MLM) 🎭

BERT's key training innovation

The Problem with Traditional LM:

  • Left-to-right: Only sees previous words
  • Right-to-left: Only sees next words
  • Want: See both directions simultaneously
  • But: Can't just show the answer during training!

Solution: Mask some words, predict them

Worked Example: MLM Training

Original: "The cat sat on the mat"

Step 1: Randomly select 15% of tokens → "cat" selected

Step 2: Apply masking strategy (80/10/10 rule):

  • 80% chance: "The [MASK] sat on the mat"
  • 10% chance: "The dog sat on the mat" (random word)
  • 10% chance: "The cat sat on the mat" (unchanged)

Step 3: Model sees full context both ways to predict "cat":


1← The [MASK] sat on the mat →
2
3   predict "cat"

Training Procedure:

  1. Randomly select 15% of tokens
  2. Replace 80% with [MASK], 10% with random word, 10% unchanged
  3. Predict original tokens
  4. Bidirectional context!
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Next Sentence Prediction (NSP) 🔗

Second pre-training task: Understand sentence relationships

Task: Given two sentences A and B, predict if B follows A in the text

Positive Example (IsNext)

Sentence A: "The cat sat on the mat."

Sentence B: "It was sleeping peacefully."

Label: IsNext ✓

Negative Example (NotNext)

Sentence A: "The cat sat on the mat."

Sentence B: "Machine learning is fascinating."

Label: NotNext ✗

Why This Helps:

  • Captures discourse relationships
  • Useful for QA, NLI, etc.
  • Models sentence-level coherence
  • Note: Later research (RoBERTa) found NSP less important than MLM
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

BERT Architecture 🏗️

Three types of embeddings are summed for each token:

Worked Example: Input Representation

Input: "[CLS] I love NLP [SEP] It is fun [SEP]"

Token Token ID Segment Position Final Embedding
[CLS] E_CLS A 0 E_CLS + E_A + E_0
I E_I A 1 E_I + E_A + E_1
love E_love A 2 E_love + E_A + E_2
NLP E_NLP A 3 E_NLP + E_A + E_3
[SEP] E_SEP A 4 E_SEP + E_A + E_4
It E_It B 5 E_It + E_B + E_5
is E_is B 6 E_is + E_B + E_6
fun E_fun B 7 E_fun + E_B + E_7
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

BERT Architecture 🏗️

Token Token ID Segment Position Final Embedding
[SEP] E_SEP B 8 E_SEP + E_B + E_8
...continued
  • Token Embedding: What word is this?
  • Segment Embedding: Which sentence (A or B)?
  • Position Embedding: Where in the sequence?
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

BERT in Practice 💻


1from transformers import BertTokenizer, BertModel
2import torch
3
4# Load pre-trained BERT
5tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
6model = BertModel.from_pretrained('bert-base-uncased')
7
8# Example sentences with "bank"
9sent1 = "I deposited money at the bank"
10sent2 = "We sat by the river bank"
11
12# Tokenize
13tokens1 = tokenizer(sent1, return_tensors='pt')
14tokens2 = tokenizer(sent2, return_tensors='pt')
15
16# Get embeddings
17with torch.no_grad():
18    output1 = model(**tokens1)
19    output2 = model(**tokens2)
20
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

BERT in Practice 💻


21# Last hidden state: [batch_size, seq_len, hidden_size]
22embeddings1 = output1.last_hidden_state
23embeddings2 = output2.last_hidden_state
24
25# Extract "bank" embedding (position varies)
26# tokens1: [CLS] i deposited money at the bank [SEP]
27bank1_embedding = embeddings1[0, 6, :]  # 768-dim vector
28
29# tokens2: [CLS] we sat by the river bank [SEP]
30bank2_embedding = embeddings2[0, 6, :]  # 768-dim vector
31
32# Different vectors for "bank"!
33from torch.nn.functional import cosine_similarity
34sim = cosine_similarity(bank1_embedding, bank2_embedding, dim=0)
35print(f"Similarity: {sim:.3f}")  # Lower than with static embeddings
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Fine-tuning BERT 🎯

Two ways to use BERT:

1. Feature Extraction:

  • Freeze BERT weights
  • Use embeddings as features
  • Train classifier on top
  • Faster, less data needed

1# Freeze BERT
2for param in bert_model.parameters():
3    param.requires_grad = False
4
5# Add classifier
6classifier = nn.Linear(768, num_classes)
7
8# Train only classifier
9optimizer = Adam(classifier.parameters())

2. Fine-tuning:

  • Update BERT weights
  • Add task-specific head
  • Train end-to-end
  • Better performance, more data needed

1# Keep BERT trainable
2bert_model = BertModel.from_pretrained(
3    'bert-base-uncased'
4)
5
6# Add classifier
7classifier = nn.Linear(768, num_classes)
8
9# Train everything
10optimizer = Adam(
11    list(bert_model.parameters()) +
12    list(classifier.parameters()),
13    lr=2e-5  # Small learning rate!
14)

Best Practice: Fine-tune with small learning rate (1e-5 to 5e-5) for few epochs (2-4)

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Contextual Embeddings Comparison 📊

| Feature        | ELMo                   | USE        | BERT      |
|----------------|------------------------|------------|-----------|
| Pre-training   | LM (forward+backward)  | Multi-task | MLM + NSP |
| Bidirectional  | Shallow                | Yes        | Deep      |
| Granularity    | Token                  | Sentence   | Token     |
| Hidden size    | 1024                   | 512        | 768/1024  |
| Parameters     | 93M                    | 256M       | 110M/340M |
| Speed          | Medium                 | Fast       | Slow      |
| OOV handling   | Characters             | Subwords   | WordPiece |
| Year           | 2018                   | 2018       | 2018      |
Recommendations
  • ELMo: Legacy systems, character-aware needs
  • USE: Sentence similarity, semantic search, fast inference
  • BERT: State-of-the-art quality, most NLP tasks (now superseded by larger models)
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Impact on NLP 🌟

BERT revolutionized NLP:

Before BERT (pre-2018):

  • Task-specific architectures
  • Train from scratch
  • Static embeddings (Word2Vec, GloVe)
  • Limited transfer learning
  • Moderate performance

Tasks that improved:

  • Question Answering (+1.5 F1 on SQuAD)
  • NER (+0.3 F1)
  • Sentiment Analysis (+2% accuracy)
  • NLI (+4% accuracy)
  • Many others!

After BERT (post-2018):

  • Pre-train, then fine-tune
  • Transfer learning standard
  • Contextual embeddings
  • Massive pre-trained models
  • State-of-the-art results

BERT Variants:

  • RoBERTa: Better training
  • ALBERT: Parameter sharing
  • DistilBERT: Smaller, faster
  • ELECTRA: Different pre-training
  • DeBERTa: Disentangled attention
  • And 100+ more!
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Real-World Applications 🚀

1. Google Search:


1Query: "can you get medicine for
2        someone pharmacy"
3
4BERT understands: picking up a
5prescription FOR someone else
6
7Before BERT: matched "medicine"
8and "pharmacy" keywords only

2. Question Answering:


1Context: "The Eiffel Tower was
2built in 1889 by Gustave Eiffel."
3
4Q: "When was the Eiffel Tower built?"
5A: "1889" ← BERT extracts this span

3. Sentiment Analysis:


1# Fine-tuned BERT
2text = "Not bad at all!"
3prediction = model(text)
4# → Positive (understands negation!)

4. Named Entity Recognition:


1Input: "Apple CEO Tim Cook announced..."
2
3Output:
4  Apple     → ORG
5  Tim Cook  → PERSON

5. Semantic Search:


1Query: "affordable laptop for students"
2Matches: "budget-friendly notebook
3          for college" ← synonyms!
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Discussion Question 💬

Do contextual embeddings truly "understand" language?

Consider:

  • BERT can distinguish "bank" (financial) from "bank" (riverside)
  • It achieves human-level performance on many benchmarks
  • But it's trained only on text co-occurrence patterns

Arguments For:

  • Captures complex semantic relationships
  • Generalizes to new contexts
  • Emergent linguistic capabilities
  • Handles compositional meaning

Arguments Against:

  • No grounding in physical world
  • No common sense reasoning
  • Exploits statistical shortcuts
  • Brittle to adversarial examples
  • "Stochastic parrots"?
The Grounding Problem Persists

Even with contextual embeddings, we still lack true grounding in experience, perception, and embodied cognition.

Reference: Bender & Koller (2020). "Climbing towards NLU: On Meaning, Form, and Understanding"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Practical Tips 💡

  1. Choosing a Model:
    • BERT-base: Good balance, 110M params
    • DistilBERT: 40% smaller, 60% faster, 97% performance
    • RoBERTa: Better than BERT, longer training
    • Domain-specific: BioBERT, SciBERT, FinBERT, etc.
  2. Fine-tuning Best Practices:
    • Small learning rate (2e-5 typical)
    • Few epochs (2-4)
    • Batch size: 16 or 32
    • Warm-up steps
    • Gradient clipping
  3. Computational Considerations:
    • BERT-base: ~110M params, 512 max tokens
    • Needs GPU (1-4 GB VRAM minimum)
    • Batching for efficiency
    • Consider DistilBERT for production
  4. Using HuggingFace:
    • Easy access to 1000+ pre-trained models
    • Standardized API
    • Good documentation and community
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Summary 🎯

What we learned today:

  1. Contextual vs. Static: Different vectors per occurrence
  2. ELMo (2018):
    • BiLSTM language models
    • Character-based, handles OOV
    • Task-specific weighting
  3. Universal Sentence Encoder (2018):
    • Sentence-level embeddings
    • Two variants: Transformer & DAN
    • Optimized for semantic similarity
  4. BERT (2018):
    • Masked language modeling
    • Deep bidirectional transformers
    • Pre-train + fine-tune paradigm
    • Revolutionized NLP
  5. Impact: Established modern transfer learning in NLP
  6. Next: Dimensionality reduction techniques for visualization
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Key References 📚

Foundational Papers:

  • Peters et al. (2018). "Deep contextualized word representations" (ELMo)
  • Cer et al. (2018). "Universal Sentence Encoder"
  • Devlin et al. (2018). "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"

BERT Variants:

  • Liu et al. (2019). "RoBERTa: A Robustly Optimized BERT Pretraining Approach"
  • Sanh et al. (2019). "DistilBERT, a distilled version of BERT"
  • Lan et al. (2019). "ALBERT: A Lite BERT for Self-supervised Learning"

Critical Perspectives:

  • Bender & Koller (2020). "Climbing towards NLU: On Meaning, Form, and Understanding"
  • Bender et al. (2021). "On the Dangers of Stochastic Parrots"

Resources:

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Questions? 🙋

Next Lecture:

Dimensionality Reduction: PCA, t-SNE, UMAP

Visualizing high-dimensional embeddings!

Winter 2026