PSYC 51.07: Models of Language and Communication

Lecture 18: BERT Deep Dive

Week 6, Lecture 1 - Bidirectional Encoder Representations


Winter 2026


Today's Agenda 📋

  1. 🎯 BERT Introduction: What makes it special?
  2. 🎭 Masked Language Modeling: The key training objective
  3. 🏗️ BERT Architecture: Model sizes and specifications
  4. 📊 Pre-training & Fine-tuning: The two-stage paradigm
  5. 🔬 Contextual Embeddings: Seeing polysemy in action
  6. 💻 Using BERT: Practical code examples

Goal: Deep understanding of BERT and how it revolutionized NLP


BERT: Bidirectional Encoder Representations 🎯

BERT = Encoder-only Transformer

Key Innovation: Masked Language Modeling (MLM)

Traditional Language Models:

  • Left-to-right (GPT)

  • Or right-to-left

  • See context in only one direction

Example:

  • "The cat sat on the ___"
  • Only sees left context

BERT (MLM):

  • Mask random tokens
  • Predict them from the surrounding context
  • Sees both left & right
  • Deeper understanding

Example:

  • "The cat [MASK] on the mat"
  • Sees: "The cat" AND "on the mat"
  • Predicts: "sat"

Reference: Devlin et al. (2019) - "BERT: Pre-training of Deep Bidirectional Transformers"


Why BERT Was Revolutionary 🚀

Before BERT (2018):

  • Feature-based approaches (use Word2Vec/GloVe as features)
  • Task-specific architectures
  • Limited transfer learning
  • Unidirectional or shallow bidirectional models

BERT's Contributions:

  1. Deep Bidirectionality
    • True bidirectional context at every layer
    • Not just concatenating left-to-right and right-to-left
  2. Pre-train + Fine-tune Paradigm
    • Single pre-trained model for all tasks
    • Fine-tune with minimal architecture changes
    • Democratized NLP (no need to train from scratch!)
  3. State-of-the-Art Results
    • Beat previous best on 11 NLP tasks
    • Large performance gains (sometimes 10+ points!)
    • Showed power of pre-training

Masked Language Modeling (MLM) 🎭

BERT's Pre-training Objective

Training Procedure:

  1. Take a sentence
  2. Randomly select 15% of tokens for prediction
  3. Of the selected tokens:
    • 80%: Replace with [MASK]
    • 10%: Replace with a random word
    • 10%: Keep unchanged
  4. Predict the original tokens

Example

Original: "My dog is hairy"

Masked: "My dog is [MASK]"

Labels: [-, -, -, hairy]

Prediction: Model predicts "hairy" using bidirectional context

Why the 80/10/10 split?

  • Prevents overfitting to [MASK] token
  • Forces model to maintain representations for all tokens
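
A minimal sketch of this masking step in Python (hypothetical helper, not BERT's actual implementation):

import random

def mask_for_mlm(tokens, vocab, select_prob=0.15):
    # Illustrative 80/10/10 masking; real implementations work on token ids and batches
    inputs, labels = list(tokens), [None] * len(tokens)
    for i, tok in enumerate(tokens):
        if random.random() < select_prob:         # choose ~15% of positions for prediction
            labels[i] = tok                       # loss is computed only at these positions
            r = random.random()
            if r < 0.8:
                inputs[i] = "[MASK]"              # 80%: replace with [MASK]
            elif r < 0.9:
                inputs[i] = random.choice(vocab)  # 10%: replace with a random token
            # remaining 10%: keep the original token unchanged
    return inputs, labels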

MLM Example: Step by Step 📝

Sentence: "The quick brown fox jumps over the lazy dog"

Step 1: Select Tokens (15%)
9 tokens total, mask ~1-2


tokens = ["The", "quick", "brown", "fox",
          "jumps", "over", "the", "lazy", "dog"]
# Randomly select: "quick" (idx 1), "over" (idx 5)

Step 2: Apply 80/10/10 Strategy


1"quick"  80%  [MASK]
2"over"   10%  "under" (random)

Step 3: Create Training Example


input:  "The [MASK] brown fox jumps
         under the lazy dog"
labels: [-1, "quick", -1, -1, -1,
         "over", -1, -1, -1]
# -1 = no loss computed

Step 4: Model Predicts


1P("quick" | context)  high (adjective slot)
2P("over" | context)   high (preposition slot)

Key insight: Model must understand syntax AND semantics to predict masked words!


Next Sentence Prediction (NSP) 🔗

BERT's second pre-training objective (debated usefulness)

Task: Given two sentences A and B, predict if B follows A

Positive Example (IsNext)

Sentence A: "The man went to the store."

Sentence B: "He bought a gallon of milk."

Label: IsNext ✓

Negative Example (NotNext)

Sentence A: "The man went to the store."

Sentence B: "Penguins are flightless birds."

Label: NotNext ✗

Implementation:

  • Use [CLS] token representation for classification
  • 50% real pairs, 50% random pairs
  • Binary classification task
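
A minimal sketch of running this with HuggingFace's NSP head (bert-base-uncased assumed; for illustration only):

from transformers import BertTokenizer, BertForNextSentencePrediction
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

sent_a = "The man went to the store."
sent_b = "He bought a gallon of milk."

# Tokenizer builds [CLS] A [SEP] B [SEP] and the segment (token_type) ids
inputs = tokenizer(sent_a, sent_b, return_tensors='pt')
with torch.no_grad():
    logits = model(**inputs).logits       # shape (1, 2): [IsNext, NotNext]
print(torch.softmax(logits, dim=-1))      # coherent pairs get high IsNext probability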

Note: Later work (RoBERTa) showed NSP might not be necessary!


BERT Architecture Variants 📊

Model        Layers   Hidden size   Attention heads   Parameters
BERT-Base    12       768           12                110M
BERT-Large   24       1024          16                340M

Architecture Details (BERT-Base):

  • 12 transformer encoder layers
  • 768-dimensional hidden states
  • 12 attention heads per layer (64 dims each)
  • 3072-dimensional feed-forward intermediate size (4x expansion)
  • Maximum sequence length: 512 tokens
  • Vocabulary size: 30,000 WordPiece tokens
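
These numbers can be read off a pre-trained checkpoint's config (a quick check with HuggingFace; bert-base-uncased assumed):

from transformers import BertConfig, BertModel

config = BertConfig.from_pretrained('bert-base-uncased')
print(config.num_hidden_layers)        # 12
print(config.hidden_size)              # 768
print(config.num_attention_heads)      # 12
print(config.intermediate_size)        # 3072
print(config.max_position_embeddings)  # 512
print(config.vocab_size)               # 30522

model = BertModel.from_pretrained('bert-base-uncased')
print(sum(p.numel() for p in model.parameters()))  # roughly 110M parameters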

Special Tokens:

  • [CLS]: Classification token (first token, used for sequence-level tasks)
  • [SEP]: Separator token (between sentences)
  • [MASK]: Mask token (for MLM)
  • [PAD]: Padding token (for variable-length sequences)

BERT Input Representation 🔤

Three types of embeddings are summed:


# Example: Sentence pair for NSP
sentence_a = "My dog is cute"
sentence_b = "He likes playing"

# Tokenization
tokens = ["[CLS]", "my", "dog", "is", "cute", "[SEP]", "he", "likes", "playing", "[SEP]"]

# Three embedding types (each is a 768-dim vector):
token_emb    = [E_CLS, E_my, E_dog, E_is, E_cute, E_SEP, E_he, E_likes, E_playing, E_SEP]
segment_emb  = [E_A,   E_A,  E_A,   E_A,  E_A,    E_A,   E_B,  E_B,     E_B,       E_B  ]
position_emb = [E_0,   E_1,  E_2,   E_3,  E_4,    E_5,   E_6,  E_7,     E_8,       E_9  ]

# Final input = token + segment + position (element-wise sum)
input_embedding = token_emb + segment_emb + position_emb

Three embedding types:

  1. Token Embeddings: WordPiece vocabulary (30K learned vectors)
  2. Segment Embeddings: Which sentence (A or B)? (2 learned vectors)
  3. Position Embeddings: Learned position 0-511 (512 learned vectors)
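
With HuggingFace, the tokenizer produces the token ids and segment (token_type) ids for a sentence pair automatically (a small illustration; the exact subword split depends on the vocabulary):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer("My dog is cute", "He likes playing")

print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# e.g. ['[CLS]', 'my', 'dog', 'is', 'cute', '[SEP]', 'he', 'likes', ..., '[SEP]']
print(enc["token_type_ids"])  # 0 for sentence A positions, 1 for sentence B positions
# Position embeddings are added inside the model based on each token's index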

WordPiece Tokenization: Worked Example 🔤

How BERT handles unknown words


from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Common words stay intact
tokenizer.tokenize("The cat sat on the mat")
# → ['the', 'cat', 'sat', 'on', 'the', 'mat']

# Rare/unknown words get split into subwords
tokenizer.tokenize("unbelievably")
# → ['un', '##believable', '##ly']  # "##" means continuation

tokenizer.tokenize("ChatGPT is transformative")
# → ['chat', '##g', '##pt', 'is', 'transform', '##ative']
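
Continuing with the tokenizer above, subword pieces can be mapped back and forth (illustrative round-trip):

# "##" pieces re-attach to the preceding piece when converting back to a string
print(tokenizer.convert_tokens_to_string(['un', '##believable', '##ly']))  # unbelievably

# Tokens round-trip through the vocabulary as integer ids
ids = tokenizer.convert_tokens_to_ids(['the', 'cat', 'sat'])
print(tokenizer.convert_ids_to_tokens(ids))  # ['the', 'cat', 'sat']
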
Why WordPiece?
  • No OOV problem: Any word can be represented as subwords
  • Morphology: Learns word parts (prefixes, suffixes, stems)
  • Compact vocabulary: 30K tokens cover most text
  • Trade-off: Rare words take more tokens (longer sequences)

BERT Pre-training 🏋️

Massive scale pre-training on unlabeled text

Pre-training Data:

  • BooksCorpus: 800M words (novels, fiction)
  • English Wikipedia: 2,500M words
  • Total: 3.3 billion words
  • Diverse, high-quality text

Training Details:

  • Batch size: 256 sequences (128,000 tokens)
  • Training steps: 1M steps
  • Optimization: Adam (lr=1e-4, warmup=10k steps)
  • Hardware: 4-16 Cloud TPUs
  • Training time: 4 days (BERT-Base), 4+ days (BERT-Large)
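
A hedged sketch of that optimization schedule in PyTorch (linear warmup then linear decay; the model and step counts here are placeholders, not the original training code):

import torch
from transformers import BertConfig, BertForPreTraining

model = BertForPreTraining(BertConfig())  # randomly initialized stand-in

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

def lr_factor(step, warmup=10_000, total=1_000_000):
    if step < warmup:
        return step / warmup                            # linear warmup to lr=1e-4
    return max(0.0, (total - step) / (total - warmup))  # then linear decay toward 0

scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_factor)
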
Key Insight

Pre-training learns general language understanding that transfers to many downstream tasks!


Fine-tuning BERT 🎓

Two-stage process: Pre-train then Fine-tune

Stage 1: Pre-training (done once)


# Expensive: weeks on TPUs
# Data: 3.3B words (books + Wikipedia)
# Task: MLM + NSP
# Result: General language understanding

# (pseudocode for illustration, not a real API)
model = pretrain_bert(
    data=["BooksCorpus", "Wikipedia"],
    steps=1_000_000,
    hardware="16 TPUs"
)

Stage 2: Fine-tuning (per task)


# Cheap: hours on a single GPU
# Data: 1K-100K labeled examples
# Task: Your specific task
# Result: Task-specific model

# (pseudocode for illustration; see the HuggingFace example on the next slide)
model = load_pretrained("bert-base")
model.add_classifier(num_labels=2)
model.train(
    task_data,
    epochs=3,
    lr=2e-5  # Small learning rate!
)

Benefits: Pre-training captures language; fine-tuning adapts to your task


Fine-tuning for Different Tasks 🎯

Minimal architecture changes needed!

  1. Single Sentence Classification
    • Input: [CLS] sentence [SEP]
    • Output: [CLS] representation → classifier
    • Example: Sentiment analysis
  2. Sentence Pair Classification
    • Input: [CLS] sentence A [SEP] sentence B [SEP]
    • Output: [CLS] representation → classifier
    • Example: Natural Language Inference
  3. Question Answering
    • Input: [CLS] question [SEP] passage [SEP]
    • Output: Token-level predictions for start/end positions
    • Example: SQuAD
  4. Token Classification
    • Input: [CLS] sentence [SEP]
    • Output: Each token representation → classifier
    • Example: Named Entity Recognition
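
In HuggingFace Transformers, each of these task formats corresponds to a ready-made head on top of the same encoder (a quick sketch; the label counts are arbitrary examples):

from transformers import (
    BertForSequenceClassification,  # [CLS] → label (sentiment, NLI)
    BertForQuestionAnswering,       # per-token start/end logits (SQuAD)
    BertForTokenClassification,     # per-token labels (NER)
)

# Same pre-trained encoder underneath; only the small output head differs
clf = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
qa  = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
ner = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=9)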

Fine-tuning BERT: Code Example 💻

Using HuggingFace Transformers


from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# Load pre-trained BERT with classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2  # Binary classification
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=2e-5,
    warmup_steps=500,
)

# Train (train_dataset / eval_dataset are tokenized datasets prepared beforehand)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

Reference: HuggingFace Course - Chapters 1.5, 7.3


Contextual Embeddings in Action 🔬

Remember "bank"? Let's see BERT handle it!


from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Two different contexts for "bank"
sent1 = "I deposited money at the bank"
sent2 = "We sat by the river bank"

# Get embeddings
def get_embedding(sentence, target_word):
    inputs = tokenizer(sentence, return_tensors='pt')
    outputs = model(**inputs)
    # Find position of target word
    tokens = tokenizer.tokenize(sentence)
    idx = tokens.index(target_word) + 1  # +1 for [CLS]
    return outputs.last_hidden_state[0, idx, :]

emb1 = get_embedding(sent1, "bank")  # Financial bank
emb2 = get_embedding(sent2, "bank")  # River bank

# Compare similarity
similarity = torch.cosine_similarity(emb1, emb2, dim=0)
print(f"Similarity: {similarity:.3f}")  # Low! (~0.3-0.5)
# Different contexts → Different embeddings!

Visualizing BERT's Contextual Embeddings 📊

Same word, different meanings, different vectors


# Find nearest neighbors for "bank" in each context
# (find_nearest_words is an illustrative placeholder, e.g. built on
#  sklearn.neighbors.NearestNeighbors over vocabulary embeddings)
from sklearn.neighbors import NearestNeighbors

# Financial "bank" context
neighbors_financial = find_nearest_words(emb1, vocabulary)
# → ["banks", "financial", "account", "deposit", "loan", "credit"]

# River "bank" context
neighbors_river = find_nearest_words(emb2, vocabulary)
# → ["shore", "riverside", "banks", "stream", "water", "edge"]

Concrete Measurements

Word Pair                     Word2Vec Similarity   BERT Similarity
bank (fin) vs bank (river)    1.00 (same vector!)   0.42
bank (fin) vs money           0.65                  0.78
bank (river) vs shore         0.52                  0.81

BERT captures meaning differences that static embeddings miss!


BERT's Impressive Results 📈

State-of-the-art on 11 NLP tasks when released (2018)

Task                Metric     Previous SOTA   BERT
SQuAD 2.0 (QA)      F1         66.3            83.1
MNLI (NLI)          Accuracy   80.6            86.7
SST-2 (Sentiment)   Accuracy   93.2            94.9
CoNLL-2003 (NER)    F1         92.6            92.8

Key Observations:

  • Largest gains on tasks requiring understanding (QA, NLI)
  • Improvements even on well-studied benchmarks
  • BERT-Large generally better than BERT-Base
  • Fine-tuning is simple but very effective
Impact

BERT made pre-trained transformers the standard approach in NLP. Almost all subsequent models build on BERT's ideas!


What Does BERT Learn? 🧠

Probing BERT's internal representations

Research has shown BERT captures:

  1. Syntactic Information
    • Part-of-speech tags
    • Constituent structure
    • Dependency relations
    • Lower layers encode more syntax
  2. Semantic Information
    • Word sense disambiguation
    • Semantic roles
    • Entity types
    • Middle layers encode more semantics
  3. Pragmatic Information
    • Coreference resolution
    • Discourse relations
    • Higher layers encode more pragmatics
  4. World Knowledge
    • Factual knowledge (to some extent)
    • Common sense reasoning (limited)

Reference: Tenney et al. (2019) - "BERT Rediscovers the Classical NLP Pipeline"


BERT Layer Analysis 📊

Different layers capture different linguistic properties


# Probing experiment: train linear classifiers on each layer's representations
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

# Get hidden states for all layers
inputs = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors='pt')
outputs = model(**inputs)
hidden_states = outputs.hidden_states  # 13 tensors: embedding layer + 12 transformer layers

# Results from probing studies (Tenney et al., 2019):
layer_specialization = {
    "Layers 0-2":   ["POS tagging", "Word boundaries"],      # Surface
    "Layers 3-6":   ["Parse trees", "Dependencies"],         # Syntax
    "Layers 7-9":   ["Semantic roles", "Coreference"],       # Semantics
    "Layers 10-12": ["Task-specific representations"],       # Task
}

Observations:

  • Lower layers: surface features (word forms, POS)
  • Middle layers: syntax (phrase structure, dependencies)
  • Higher layers: semantics and task-specific features

Similar to CNNs: edges → shapes → objects
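
A toy probing setup along these lines (a sketch under simplifying assumptions, reusing the tokenizer and model loaded above; the training sentences and labels are placeholders):

import torch
from sklearn.linear_model import LogisticRegression

def layer_features(sentences, layer):
    feats = []
    for s in sentences:
        enc = tokenizer(s, return_tensors='pt')
        with torch.no_grad():
            hs = model(**enc).hidden_states[layer]  # (1, seq_len, 768)
        feats.append(hs[0].mean(dim=0).numpy())     # crude fixed-size sentence vector
    return feats

# probe = LogisticRegression(max_iter=1000).fit(layer_features(train_sents, 4), train_labels)
# Comparing probe accuracy across layers shows where a property is most linearly decodable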


Discussion Questions 💭

  1. MLM vs. Autoregressive:
    • Why is MLM better for understanding tasks?
    • Can BERT generate text like GPT?
    • What are the trade-offs?
  2. The 80/10/10 Masking Strategy:
    • Why not just use 100% [MASK]?
    • What problem does the random replacement solve?
    • Could we improve this strategy?
  3. Pre-training Data:
    • Why use books and Wikipedia?
    • Would social media text work as well?
    • How does data quality affect pre-training?
  4. Fine-tuning:
    • Why does fine-tuning work so well?
    • When might fine-tuning fail?
    • How much labeled data do we need?

Looking Ahead 🔮

Today we learned:

  • BERT architecture and innovations
  • Masked Language Modeling
  • Pre-training and fine-tuning
  • Contextual embeddings
  • What BERT learns

Next lecture: BERT Variants

  • RoBERTa: Optimized BERT training
  • ALBERT: Parameter-efficient BERT
  • DistilBERT: Smaller, faster BERT
  • Others: ELECTRA, DeBERTa, and more
  • Comparative analysis and when to use which

BERT started a revolution in NLP! 🚀


Summary 🎯

Key Takeaways:

  1. BERT = Encoder-only Transformer
    • Bidirectional self-attention
    • Trained with Masked Language Modeling
  2. Pre-train + Fine-tune Paradigm
    • Expensive pre-training on unlabeled data (once)
    • Cheap fine-tuning on task-specific data (per task)
  3. Contextual Embeddings
    • Different representations based on context
    • Solves polysemy problem
  4. Hierarchical Learning
    • Lower layers: syntax
    • Higher layers: semantics
    • Learns linguistic structure automatically
  5. Revolutionary Impact
    • Established pre-training as standard
    • Democratized NLP research
    • Foundation for modern LLMs

References 📚

Essential Papers:

  • Devlin et al. (2019) - "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"
    • The original BERT paper
    • Introduced MLM and NSP
  • Tenney et al. (2019) - "BERT Rediscovers the Classical NLP Pipeline"
    • Analysis of what BERT learns
    • Layer-wise linguistic properties
  • Clark et al. (2019) - "What Does BERT Look At? An Analysis of BERT's Attention"
    • Understanding BERT's attention patterns

Tutorials:

  • HuggingFace Course: Chapter 1 (Transformer Models)
  • Jay Alammar: "The Illustrated BERT, ELMo, and co."
  • BertViz: Interactive attention visualization

Questions? 🙋

Discussion Time

Topics for discussion:

  • Masked Language Modeling
  • Pre-training vs fine-tuning
  • Contextual embeddings
  • BERT architecture details
  • Implementation questions

Thank you! 🙏

Next: BERT Variants and Improvements!
