* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 12: Contextual embeddings

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Understand the polysemy problem and why static embeddings struggle
  2. Explain how ELMo uses bidirectional LSTMs for context
  3. Apply Universal Sentence Encoder for semantic similarity
  4. Understand BERT's masked language modeling approach
  5. Compare contextual embedding methods and their trade-offs

Contextual embeddings changed NLP: same word, different vectors depending on context!

The polysemy problem

Word2Vec, GloVe, and FastText assign the same vector to every occurrence of a word.

  1. "I deposited money at the bank" (financial institution)
  2. "We sat by the river bank" (riverside)
  3. "The plane will bank left" (tilt/turn)

Static embeddings: All three get the SAME vector!

Contextual embeddings: Each gets a DIFFERENT vector based on context.

Static embeddings (Word2Vec, GloVe, FastText)


1model = Word2Vec(...)
2vec1 = model['bank']
3vec2 = model['bank']
4assert vec1 == vec2  # Always True!
  • One vector per word type — lookup table
  • Context-independent — ignores surrounding words
  • Fast — dictionary lookup, no computation

Contextual embeddings (ELMo, BERT)


1sent1 = "river bank"
2sent2 = "money bank"
3vec1 = get_embedding(model, sent1, 'bank')
4vec2 = get_embedding(model, sent2, 'bank')
5assert vec1 != vec2  # Always True!
  • Different vector per occurrence — context-dependent
  • Bidirectional context — sees words before AND after
  • Slower — requires forward pass through neural network

Language models as feature extractors

Train a language model, use its internal states as embeddings!

Fundamental goal: predict the next word given previous words: $P(w_t \mid w_1, w_2, \ldots, w_{t-1})$

Input: "The cat sat on the ___"

Model predicts: "mat" (0.25), "floor" (0.18), "couch" (0.12), "dog" (0.001)

To predict well, the model learns: "sat on the" suggests a surface!

Hidden states capture contextual understanding. We extract these rich hidden states as embeddings!

ELMo: Embeddings from Language Models

  • Train deep bidirectional language model
  • Use all layer activations
  • Weighted combination per task
  • Character-based input (handles OOV!)

ELMo architecture

  • Forward LM: $P(w_t \mid w_1, \ldots, w_{t-1})$
  • Backward LM: $P(w_t \mid w_{t+1}, \ldots, w_n)$

Each word gets info from BOTH directions!

For sentiment analysis, ELMo might learn:

  • Layer 0 (characters): weight = 0.1
  • Layer 1 (syntax): weight = 0.3
  • Layer 2 (semantics): weight = 0.6

Higher layers matter more for meaning!

Improved state-of-the-art on many NLP tasks: question answering, sentiment analysis, named entity recognition, coreference resolution, semantic role labeling, and more!

ELMo in Python


1from allennlp.modules.elmo import Elmo, batch_to_ids
2
3# Initialize ELMo (need to download these files first)
4options_file = "elmo_options.json"
5weight_file = "elmo_weights.hdf5"
6elmo = Elmo(options_file, weight_file, 2, dropout=0)
7
8# prepare inputs and get embeddings
9sentences = [ 
10    ['I', 'deposited', 'money', 'at', 'the', 'bank'],
11    ['We', 'sat', 'by', 'the', 'river', 'bank']
12]
13character_ids = batch_to_ids(sentences)
14embeddings = elmo(character_ids)
15
16# Different vectors for "bank"!
17bank1 = embeddings['elmo_representations'][0][0, 5, :]  # first sentence
18bank2 = embeddings['elmo_representations'][0][1, 5, :]  # second sentence

Universal Sentence Encoder (USE)

Cer et al. (2018, EMNLP) Universal Sentence Encoder.

  • Produces 512-dimensional vector for any sentence
  • Two variants: Transformer (higher accuracy) and Deep Averaging Network (DAN; faster)
  • Optimized for semantic similarity tasks
  • Performs really well in practice (still widely used today!)

Semantic search, question answering, text classification, clustering, duplicate detection.

Universal Sentence Encoder in Python


1import tensorflow_hub as hub
2import numpy as np
3from sklearn.metrics.pairwise import cosine_similarity
4
5# Load model
6embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
7
8sentences = [
9    "The cat sat on the mat.",
10    "A feline rested on the rug.",  # Paraphrase!
11    "The dog ran in the park.",
12    "I love machine learning."
13]
14
15embeddings = embed(sentences) # shape: [4, 512]
16
17# Compute similarity - sentences 1 and 2 should be very similar
18sim_matrix = cosine_similarity(embeddings)

BERT: Bidirectional Encoder Representations

Vaswani et al. (2017, NIPS) Attention is All You Need.

Devlin, Chang, Lee, Toutanova (2019, NAACL) BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding.

We will cover Transformers in detail later. For now: it's a very powerful architecture for sequence modeling. The core idea is self-attention: each word "attends to" (i.e., is affected by) all other words in the sequence, capturing rich dependencies.

  • Bidirectional context (like ELMo, but captures much deeper patterns due to Transformer)
  • Massive pre-training (3.3B word corpus)
  • Two pre-training tasks: Masked Language Modeling (MLM) & Next Sentence Prediction (NSP)
  • Fine-tunable for many downstream tasks

SOTA on 11 NLP tasks. Sparked the "BERT-era" with 1000+ variants (RoBERTa, ALBERT, DistilBERT, ...). We've already used some of these (e.g., DistilBERT for sentiment analysis)!

Masked Language Modeling (MLM)

  • Left-to-right: only sees previous words
  • Right-to-left: only sees next words
  • Ideally we would see both directions simultaneously...
  • ...but we can't just show the answer during training!

Original: "The cat sat on the mat"
Masked: "The [MASK] sat on the mat"
Task: Predict "cat" using context from BOTH sides!

MLM training procedure

Step 1: Randomly select 15% of tokens → "cat" selected

Step 2: Apply masking strategy (80/10/10 rule):

  • 80%: "The [MASK] sat on the mat"
  • 10%: "The cat sat on the mat" (unchanged)
  • 10%: "The answer sat on the mat" (random word)

Step 3: Model predicts "cat" using full bidirectional context

Prevents model from only learning to predict [MASK] tokens. Forces it to make good predictions for any token.

Next Sentence Prediction (NSP)

Given two sentences A and B, predict if B follows A in the text.

A: "The cat sat on the mat."
B: "It was sleeping peacefully."
Label: IsNext

A: "The cat sat on the mat."
B: "Machine learning is fascinating."
Label: NotNext

Later research (RoBERTa) found NSP less important than MLM for most tasks.

BERT input representation

Input: "[CLS] I love NLP [SEP] It is fun [SEP]"

Token Token Emb Segment Position
[CLS] E_CLS A 0
I E_I A 1
love E_love A 2
NLP E_NLP A 3
[SEP] E_SEP A 4
It E_It B 5
is E_is B 6
fun E_fun B 7

Final embedding = Token + Segment + Position

BERT in Python


1from transformers import BertTokenizer, BertModel
2import torch
3
4tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
5model = BertModel.from_pretrained('bert-base-uncased')
6
7sent1, sent2 = "I deposited money at the bank", "We sat by the river bank"
8
9# Tokenize and get embeddings
10tokens1 = tokenizer(sent1, return_tensors='pt')
11tokens2 = tokenizer(sent2, return_tensors='pt')
12
13with torch.no_grad():
14    output1 = model(**tokens1)
15    output2 = model(**tokens2)
16
17# Extract "bank" embedding - different vectors!
18bank1 = output1.last_hidden_state[0, 6, :]  # 768-dim
19bank2 = output2.last_hidden_state[0, 6, :]  # 768-dim

Fine-tuning BERT

Fine-tuning: starting with pre-trained model weights, then updating them on a specific downstream task (e.g., sentiment analysis, question answering).

  • Leverages knowledge from massive pre-training
  • Adapts model to specific task/domain
  • Much faster than training from scratch

Fine-tuning BERT


1for param in bert_model.parameters():
2    param.requires_grad = False
3classifier = nn.Linear(768, num_classes)
4# Train only classifier

1bert_model = BertModel.from_pretrained('bert-base-uncased')
2classifier = nn.Linear(768, num_classes)
3optimizer = Adam([...], lr=2e-5)  # Small learning rate!
4# Train everything end-to-end

Fine-tune with small learning rate (1e-5 to 5e-5) for few epochs (2-4).

Real-world applications

Query: "can you get medicine for someone pharmacy"

BERT understands: picking up a prescription FOR someone else (not just keyword matching!)

Context: "The Eiffel Tower was built in 1889 by Gustave Eiffel."
Q: "When was the Eiffel Tower built?"
A: "1889" ← BERT extracts this span

"Not bad at all!" → Positive (understands negation!)

"Apple is looking at buying U.K. startup for $1 billion" → Recognizes "Apple" as ORG, "U.K." as LOC, "$1 billion" as MONEY.

Discussion: do contextual embeddings reflect "understanding"?

  • Capture complex semantic relationships
  • Generalize to new contexts
  • Handle compositional meaning (e.g., idioms, metaphors, sarcasm)
  • Human-level performance on many benchmarks
  • Brittle to adversarial examples (small changes can fool them)
  • Lack of world knowledge and reasoning: grammatical but nonsensical sentences can be rated as highly probable
  • Struggle with out-of-distribution inputs that humans handle easily (e.g., novel metaphors, jokes)

Notice that with these models we are far beyond pre-defined symbolic rules and simple co-occurrence statistics. They capture a lot of nuance in language use that was previously out of reach. Arguing about where models fall short of "true understanding" is becoming a moving target — and an increasingly subtle one.

Lack of grounding in actual experience means that these models cannot distinguish between reasonable vs. unreasonable statements that happen to be linguistically well-formed. Whatever "understanding" these models might be said to have is fundamentally different from human understanding: it is based purely on patterns in text, without any connection to real-world referents or experiences.

Summary

  1. Contextual vs. static: Different vectors per occurrence solves polysemy
  2. ELMo (2018): BiLSTM language models, character-based, task-specific weighting
  3. USE (2018): Sentence-level embeddings optimized for similarity
  4. BERT (2018): Masked language modeling, deep bidirectional transformers
  5. Impact: Established modern transfer learning paradigm in NLP

Questions?

📧 Email me
💬 Join our Discord
💁 Come to office hours

Dimensionality reduction techniques (PCA, t-SNE, UMAP) for visualizing embeddings!