/*
 * 2. Include this script:
 * 3. Create charts with minimal configuration - colors are auto-applied!
 */
(function () {
  'use strict';

  // Resolve the global object once. The original referenced `window`
  // directly, which throws a ReferenceError outside a browser (Node, web
  // workers); in a browser this is exactly `window`, so behavior there is
  // unchanged.
  const globalScope = typeof window !== 'undefined' ? window : globalThis;

  // ==========================================================================
  // READ COLORS FROM CSS CUSTOM PROPERTIES
  // This ensures chart colors stay in sync with the theme
  // ==========================================================================

  /**
   * Get a CSS custom property value from :root.
   * @param {string} name - Custom property name, e.g. '--chart-color-1'.
   * @param {string} [fallback=''] - Returned when the property is unset/empty
   *   or when getComputedStyle is unavailable (non-browser environment).
   * @returns {string} Trimmed property value, or the fallback.
   */
  function getCSSVar(name, fallback = '') {
    if (typeof getComputedStyle === 'undefined') return fallback;
    const value = getComputedStyle(document.documentElement)
      .getPropertyValue(name)
      .trim();
    return value || fallback;
  }

  /**
   * Build the palette from CSS custom properties (with hard-coded fallbacks
   * for every entry, so this works even before the stylesheet loads).
   * @returns {object} Palette: brand colors, chart color cycles, grid/axis
   *   colors, and the chart font family.
   */
  function buildPaletteFromCSS() {
    return {
      // Primary brand colors
      dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'),
      textPrimary: getCSSVar('--text-primary', '#0a2518'),
      textSecondary: getCSSVar('--text-secondary', '#0a3d23'),

      // Chart colors (from CSS --chart-color-N variables)
      chartColors: [
        getCSSVar('--chart-color-1', '#00693e'),
        getCSSVar('--chart-color-2', '#267aba'),
        getCSSVar('--chart-color-3', '#ffa00f'),
        getCSSVar('--chart-color-4', '#9d162e'),
        getCSSVar('--chart-color-5', '#8a6996'),
        getCSSVar('--chart-color-6', '#a5d75f'),
        getCSSVar('--chart-color-7', '#003c73'),
        getCSSVar('--chart-color-8', '#d94415'),
        getCSSVar('--chart-color-9', '#643c20'),
        getCSSVar('--chart-color-10', '#c4dd88'),
        getCSSVar('--chart-color-11', '#f5dc69'),
        getCSSVar('--chart-color-12', '#424141'),
      ],

      // Background colors (semi-transparent versions)
      chartBgColors: [
        getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'),
        getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'),
        getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'),
        getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'),
        getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'),
        getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'),
      ],

      // Semantic colors
      positive: getCSSVar('--chart-positive', '#00693e'),
      negative: getCSSVar('--chart-negative', '#9d162e'),
      neutral: getCSSVar('--chart-neutral', '#424141'),
      highlight: getCSSVar('--chart-highlight', '#ffa00f'),

      // Grid and axis colors
      gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'),
      gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'),
      gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'),
      axisColor: getCSSVar('--chart-axis-color', '#0a2518'),

      // Font
      fontFamily: getCSSVar(
        '--chart-font-family',
        "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"
      ),
    };
  }

  // Initialize palette lazily (populated on first use, once CSS is readable)
  let CDL_PALETTE = null;

  // For convenience, expose primary chart colors array
  let CHART_COLORS = null;

  // ==========================================================================
  // FONT CONFIGURATION
  // Responsive font sizes based on typical Marp slide dimensions (1280x720)
  // ==========================================================================

  const FONT_CONFIG = {
    sizes: {
      title: 22,      // Chart title
      subtitle: 18,   // Subtitle
      legend: 16,     // Legend labels
      axisTitle: 18,  // Axis titles
      axisTicks: 16,  // Axis tick labels
      tooltip: 14,    // Tooltip text
      dataLabels: 14, // Data labels on charts
    },
    weight: {
      normal: 400,
      medium: 500,
      bold: 600,
    },
  };

  // ==========================================================================
  // HELPER FUNCTIONS
  // ==========================================================================

  /**
   * Ensure the palette has been built from CSS; build it on first call.
   * @returns {object} The shared CDL palette.
   */
  function ensurePalette() {
    if (!CDL_PALETTE) {
      CDL_PALETTE = buildPaletteFromCSS();
      CHART_COLORS = CDL_PALETTE.chartColors;
    }
    return CDL_PALETTE;
  }

  /**
   * Get the color for a dataset at a given index, cycling through the
   * palette if there are more datasets than colors.
   * @param {number} index - Zero-based dataset index.
   * @returns {string} A CSS color string.
   */
  function getColor(index) {
    ensurePalette();
    return CHART_COLORS[index % CHART_COLORS.length];
  }

  /**
   * Get a color with alpha transparency applied.
   * Handles #rrggbb, #rgb shorthand (fix: the original sliced shorthand hex
   * as if it were 6 digits, producing NaN channels), and rgba() strings;
   * any other format is returned unchanged.
   * @param {string} color - Input color.
   * @param {number} alpha - Alpha in [0, 1].
   * @returns {string} rgba() string, or the input if the format is unknown.
   */
  function getColorWithAlpha(color, alpha) {
    if (color.startsWith('#')) {
      // Expand 3-digit shorthand (#abc -> #aabbcc) before slicing channels
      const hex = color.length === 4
        ? '#' + [...color.slice(1)].map((c) => c + c).join('')
        : color;
      const r = parseInt(hex.slice(1, 3), 16);
      const g = parseInt(hex.slice(3, 5), 16);
      const b = parseInt(hex.slice(5, 7), 16);
      return `rgba(${r}, ${g}, ${b}, ${alpha})`;
    }
    if (color.startsWith('rgba')) {
      // Replace the trailing alpha component only
      return color.replace(/[\d.]+\)$/, `${alpha})`);
    }
    return color;
  }

  /**
   * Auto-assign colors (and sensible per-type styling) to every dataset in
   * a chart's data that does not already specify its own. Mutates and
   * returns `data` (Chart.js owns the config object after creation, so
   * in-place assignment is the intended contract here).
   * @param {object} data - Chart.js data object ({ labels, datasets }).
   * @param {string} chartType - Chart.js chart type ('bar', 'line', ...).
   * @returns {object} The same data object, with colors filled in.
   */
  function autoAssignColors(data, chartType) {
    if (!data || !data.datasets) return data;

    data.datasets.forEach((dataset, index) => {
      const baseColor = getColor(index);

      // Only assign colors/styles if not already specified by the caller
      switch (chartType) {
        case 'bar':
        case 'horizontalBar':
          if (!dataset.backgroundColor) dataset.backgroundColor = baseColor;
          if (!dataset.borderColor) dataset.borderColor = baseColor;
          if (dataset.borderWidth === undefined) dataset.borderWidth = 2;
          break;

        case 'line':
          if (!dataset.borderColor) dataset.borderColor = baseColor;
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1);
          }
          if (dataset.borderWidth === undefined) dataset.borderWidth = 3;
          if (dataset.pointRadius === undefined) dataset.pointRadius = 6;
          if (!dataset.pointBackgroundColor) {
            dataset.pointBackgroundColor = baseColor;
          }
          if (dataset.tension === undefined) dataset.tension = 0.3;
          break;

        case 'scatter':
        case 'bubble':
          if (!dataset.backgroundColor) dataset.backgroundColor = baseColor;
          if (!dataset.borderColor) dataset.borderColor = baseColor;
          if (dataset.pointRadius === undefined) dataset.pointRadius = 15;
          if (dataset.pointHoverRadius === undefined) {
            dataset.pointHoverRadius = 18;
          }
          break;

        case 'pie':
        case 'doughnut':
        case 'polarArea':
          // For pie-like charts, one dataset needs one color per slice
          if (!dataset.backgroundColor) {
            const numItems = dataset.data ? dataset.data.length : 6;
            dataset.backgroundColor = Array.from(
              { length: numItems },
              (_, i) => getColor(i)
            );
          }
          if (!dataset.borderColor) {
            dataset.borderColor = '#d8d8d8'; // Slide background
          }
          if (dataset.borderWidth === undefined) dataset.borderWidth = 2;
          break;

        case 'radar':
          if (!dataset.borderColor) dataset.borderColor = baseColor;
          if (!dataset.backgroundColor) {
            dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2);
          }
          if (dataset.borderWidth === undefined) dataset.borderWidth = 2;
          if (dataset.pointRadius === undefined) dataset.pointRadius = 4;
          if (!dataset.pointBackgroundColor) {
            dataset.pointBackgroundColor = baseColor;
          }
          break;

        default:
          // Generic color assignment
          if (!dataset.backgroundColor) dataset.backgroundColor = baseColor;
          if (!dataset.borderColor) dataset.borderColor = baseColor;
      }
    });

    return data;
  }

  // ==========================================================================
  // CHART.JS GLOBAL DEFAULTS
  // ==========================================================================

  /**
   * Apply CDL fonts, colors, and scale styling to Chart.defaults.
   * @returns {boolean} True if Chart.js was present and defaults applied.
   */
  function applyGlobalDefaults() {
    if (typeof Chart === 'undefined') {
      console.warn(
        'Chart.js not loaded. chart-defaults.js requires Chart.js to be loaded first.'
      );
      return false;
    }

    // Ensure palette is loaded from CSS
    const palette = ensurePalette();

    // Font defaults
    Chart.defaults.font.family = palette.fontFamily;
    Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks;
    Chart.defaults.color = palette.textPrimary;

    // Responsive defaults
    Chart.defaults.responsive = true;
    Chart.defaults.maintainAspectRatio = false;

    // Animation (subtle)
    Chart.defaults.animation.duration = 400;

    // Legend
    Chart.defaults.plugins.legend.labels.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.legend,
      weight: FONT_CONFIG.weight.normal,
    };
    Chart.defaults.plugins.legend.labels.color = palette.textPrimary;
    Chart.defaults.plugins.legend.labels.usePointStyle = true;
    Chart.defaults.plugins.legend.labels.padding = 20;

    // Title
    Chart.defaults.plugins.title.font = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.title,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.title.color = palette.textPrimary;

    // Tooltip
    Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary;
    Chart.defaults.plugins.tooltip.titleFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
      weight: FONT_CONFIG.weight.medium,
    };
    Chart.defaults.plugins.tooltip.bodyFont = {
      family: palette.fontFamily,
      size: FONT_CONFIG.sizes.tooltip,
    };
    Chart.defaults.plugins.tooltip.cornerRadius = 4;
    Chart.defaults.plugins.tooltip.padding = 10;

    // Scale defaults (for cartesian charts), applied per scale type below
    const scaleDefaults = {
      grid: {
        color: palette.gridLight,
        lineWidth: 1,
      },
      border: {
        color: palette.gridDark,
        width: 1,
      },
      ticks: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        },
        color: palette.textPrimary,
      },
      title: {
        font: {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTitle,
          weight: FONT_CONFIG.weight.normal,
        },
        color: palette.textPrimary,
      },
    };

    // Apply scale defaults to each cartesian scale type (the original
    // repeated this block verbatim for linear/category/logarithmic)
    for (const scaleType of ['linear', 'category', 'logarithmic']) {
      const scale = Chart.defaults.scales && Chart.defaults.scales[scaleType];
      if (!scale) continue;
      for (const part of ['grid', 'border', 'ticks', 'title']) {
        if (scale[part]) Object.assign(scale[part], scaleDefaults[part]);
      }
    }

    // Apply scale defaults to radial scale (for radar charts)
    if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) {
      const radial = Chart.defaults.scales.radialLinear;
      if (radial.grid) radial.grid.color = palette.gridLight;
      if (radial.angleLines) radial.angleLines.color = palette.gridMedium;
      if (radial.pointLabels) {
        radial.pointLabels.font = {
          family: palette.fontFamily,
          size: FONT_CONFIG.sizes.axisTicks,
        };
        radial.pointLabels.color = palette.textPrimary;
      }
    }

    return true;
  }

  // ==========================================================================
  // CHART WRAPPER FOR AUTO-STYLING
  // ==========================================================================

  /**
   * Wrap the global Chart constructor so every chart automatically receives
   * CDL colors and per-type option defaults before construction.
   */
  function wrapChartConstructor() {
    if (typeof Chart === 'undefined') return;

    const OriginalChart = Chart;

    // Create a wrapper that auto-applies colors and per-type defaults.
    // Note: Chart.js takes ownership of `config` on construction, so
    // filling it in-place here is the intended contract.
    globalScope.Chart = function (ctx, config) {
      if (config && config.data) {
        config.data = autoAssignColors(config.data, config.type);
      }
      if (config && config.options) {
        config.options = applyChartTypeDefaults(config.type, config.options);
      }
      return new OriginalChart(ctx, config);
    };

    // Delegate static properties (Chart.register, Chart.defaults, ...) via
    // the prototype chain. The original additionally did
    // Object.assign(wrapper, OriginalChart), which is redundant once the
    // prototype is set and can throw on non-writable statics, so it was
    // dropped.
    Object.setPrototypeOf(globalScope.Chart, OriginalChart);

    // Preserve the prototype chain so `instanceof Chart` still works
    globalScope.Chart.prototype = OriginalChart.prototype;
  }

  /**
   * Apply chart-type specific option defaults without mutating the caller's
   * options object (fix: the original shallow-copied `userOptions` and then
   * wrote through to the caller's nested `scales`/`plugins` objects).
   * @param {string} chartType - Chart.js chart type.
   * @param {object} userOptions - Caller-supplied options.
   * @returns {object} A new options object with defaults filled in.
   */
  function applyChartTypeDefaults(chartType, userOptions) {
    const options = { ...userOptions };

    switch (chartType) {
      case 'bar':
      case 'horizontalBar': {
        // Clone nested scale objects before touching them
        options.scales = { ...options.scales };
        options.scales.x = { ...options.scales.x };
        options.scales.y = { ...options.scales.y };
        // Hide x-axis grid for a cleaner look
        if (options.scales.x.grid === undefined) {
          options.scales.x.grid = { display: false };
        }
        break;
      }

      case 'line':
        if (!options.interaction) {
          options.interaction = { intersect: false, mode: 'index' };
        }
        break;

      case 'pie':
      case 'doughnut': {
        options.plugins = { ...options.plugins };
        if (options.plugins.legend === undefined) {
          const palette = ensurePalette();
          options.plugins.legend = {
            position: 'right',
            labels: {
              font: {
                family: palette.fontFamily,
                size: FONT_CONFIG.sizes.legend,
              },
              color: palette.textPrimary,
              padding: 15,
            },
          };
        }
        break;
      }

      case 'radar':
        // Radar chart defaults - keep as-is, scale defaults applied globally
        break;

      case 'scatter':
      case 'bubble':
        options.scales = { ...options.scales };
        options.scales.x = { ...options.scales.x };
        options.scales.y = { ...options.scales.y };
        break;
    }

    return options;
  }

  // ==========================================================================
  // CONVENIENCE FUNCTIONS FOR USERS
  // Exposed on window.CDLChart for easy access
  // ==========================================================================

  globalScope.CDLChart = {
    // Color palette access (getters to ensure lazy initialization)
    get colors() {
      return ensurePalette().chartColors;
    },
    get palette() {
      return ensurePalette();
    },

    // Get specific color by index
    getColor: getColor,

    // Get color with transparency
    getColorWithAlpha: getColorWithAlpha,

    /**
     * Get an array of the first `count` palette colors (cycling).
     * @param {number} count - Number of colors wanted.
     * @returns {string[]}
     */
    getColors: function (count) {
      ensurePalette();
      const result = [];
      for (let i = 0; i < count; i++) {
        result.push(getColor(i));
      }
      return result;
    },

    // Font configuration
    fonts: FONT_CONFIG,

    // Quick chart creation helpers — minimal config, auto-applies styling

    /**
     * Create a simple bar chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    bar: function (canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'bar',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: {
          plugins: { legend: { display: false } },
          ...options,
        },
      });
    },

    /**
     * Create a simple line chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - X-axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    line: function (canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'line',
        data: {
          labels: labels,
          datasets: datasets.map((ds) => ({
            label: ds.label,
            data: ds.data,
            fill: ds.fill !== undefined ? ds.fill : true,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a simple pie chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    pie: function (canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'pie',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: options,
      });
    },

    /**
     * Create a simple scatter chart.
     * @param {string} canvasId - Canvas element ID
     * @param {Array} datasets - Array of {label, data: [{x, y}]} objects
     * @param {object} options - Optional overrides
     */
    scatter: function (canvasId, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'scatter',
        data: {
          datasets: datasets.map((ds) => ({
            label: ds.label,
            data: ds.data,
          })),
        },
        options: options,
      });
    },

    /**
     * Create a doughnut chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Slice labels
     * @param {number[]} data - Data values
     * @param {object} options - Optional overrides
     */
    doughnut: function (canvasId, labels, data, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'doughnut',
        data: {
          labels: labels,
          datasets: [{ data: data }],
        },
        options: options,
      });
    },

    /**
     * Create a radar chart.
     * @param {string} canvasId - Canvas element ID
     * @param {string[]} labels - Axis labels
     * @param {Array} datasets - Array of {label, data} objects
     * @param {object} options - Optional overrides
     */
    radar: function (canvasId, labels, datasets, options = {}) {
      return new Chart(document.getElementById(canvasId), {
        type: 'radar',
        data: {
          labels: labels,
          datasets: datasets.map((ds) => ({
            label: ds.label,
            data: ds.data,
          })),
        },
        options: options,
      });
    },
  };

  // ==========================================================================
  // INITIALIZATION
  // ==========================================================================

  /**
   * Apply defaults now if Chart.js is present; otherwise poll for up to
   * five seconds (browser only — outside a browser Chart.js can never
   * appear, and the original's interval would keep the process alive).
   * @returns {boolean} True if defaults were applied synchronously.
   */
  function initialize() {
    if (typeof Chart !== 'undefined') {
      applyGlobalDefaults();
      wrapChartConstructor();
      console.log('CDL Chart defaults applied successfully.');
      return true;
    }

    if (typeof document === 'undefined') {
      console.warn('Chart.js not found. CDL Chart defaults not applied.');
      return false;
    }

    // Chart.js not yet loaded - wait and retry
    let retries = 0;
    const maxRetries = 50; // 5 seconds max wait
    const checkInterval = setInterval(function () {
      retries++;
      if (typeof Chart !== 'undefined') {
        clearInterval(checkInterval);
        applyGlobalDefaults();
        wrapChartConstructor();
        console.log(
          'CDL Chart defaults applied successfully (after waiting for Chart.js).'
        );
      } else if (retries >= maxRetries) {
        clearInterval(checkInterval);
        console.warn(
          'Chart.js not found after waiting. CDL Chart defaults not applied.'
        );
      }
    }, 100);
    return false;
  }

  // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts
  // Chart.js CDN should be loaded before this script
  initialize();
})();
PSYC 51.07: Models of Language and Communication - Week 4

Dimensionality Reduction for NLP

Lecture 13: PCA, t-SNE, and UMAP

PSYC 51.07: Models of Language and Communication - Week 4

Winter 2026

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Today's Lecture 📋

  1. 📉 The Curse of Dimensionality
  2. 📊 Principal Component Analysis (PCA)
  3. 🎨 t-SNE: Stochastic Neighbor Embedding
  4. 🗺️ UMAP: Uniform Manifold Approximation
  5. 🔍 Comparison & Best Practices
  6. 💻 Visualization Techniques

Goal: Learn to visualize and explore high-dimensional embeddings

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

The Curse of Dimensionality 📉

The Problem:

Word Embeddings:

  • Word2Vec: 300 dimensions
  • GloVe: 300 dimensions
  • FastText: 300 dimensions
  • BERT: 768 dimensions
  • GPT-3: 12,288 dimensions!

Challenges:

  • Can't visualize 300D space
  • Distances behave strangely
  • Computation expensive
  • Storage requirements
  • Interpretation difficult

Concrete Example: Distance Concentration


1import numpy as np
2
3# Sample 1000 random points in different dims
4for dim in [2, 10, 100, 300]:
5    points = np.random.randn(1000, dim)
6    dists = np.linalg.norm(points, axis=1)
7
8    print(f"Dim={dim}: mean={dists.mean():.2f}, "
9          f"std={dists.std():.2f}")
10
11# Output:
12# Dim=2:   mean=1.26, std=0.52
13# Dim=10:  mean=3.08, std=0.49
14# Dim=100: mean=9.95, std=0.50
15# Dim=300: mean=17.3, std=0.50

All points cluster around the same distance from origin!

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Why Dimensionality Reduction? 🎯

Visualization:

  • 2D/3D plots
  • Explore semantic structure
  • Identify clusters
  • Discover patterns
  • Quality assurance

Computation:

  • Faster algorithms
  • Less memory
  • Enable real-time systems
  • Scalability

Machine Learning:

  • Reduce overfitting
  • Feature selection
  • Improve generalization
  • Remove noise
  • Combat curse of dimensionality

Interpretation:

  • Understand relationships
  • Identify important dimensions
  • Communicate results
  • Debug models
The Goal

Reduce from d dimensions (e.g., 300) to k dimensions (e.g., 2 or 3) while preserving as much of the meaningful structure as possible

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Principal Component Analysis (PCA) 📊

The classic linear dimensionality reduction method

Intuition:

  • Find directions of maximum variance
  • Project data onto these directions
  • First PC: most variance
  • Second PC: second most (orthogonal)
  • And so on...

Properties:

  • Linear transformation
  • Preserves global structure
  • Deterministic
  • Fast computation
  • Interpretable (sometimes)

PC1 captures most variation

PC2 captures remaining variation (orthogonal to PC1)

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

PCA: The Algorithm 🔧

Step-by-Step Worked Example:

Original Data (3D → 2D):

Word x1 x2 x3
king 2.0 1.5 0.1
queen 1.8 1.6 0.2
man 1.0 0.5 0.1
woman 0.9 0.6 0.2

Step 1: Center data (subtract mean)

Step 2: Compute covariance matrix C

Step 3: Find eigenvectors/eigenvalues

Eigenvalues (variance captured):

  • λ₁ (85% variance)
  • λ₂ (13% variance)
  • λ₃ (2% variance)

Keep top 2 PCs → 98% variance retained!

Projected Data (2D):

Word PC1 PC2
king 1.9 0.3
queen 1.7 0.4
man 0.8 -0.2
woman 0.7 -0.1

Gender still separable, royalty still clusters!

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

PCA in Practice 💻


1from sklearn.decomposition import PCA
2from sklearn.preprocessing import StandardScaler
3import numpy as np
4import matplotlib.pyplot as plt
5
6# Assume we have word embeddings: shape (vocab_size, 300)
7# For example, Word2Vec embeddings
8
9# Standardize the data (optional but recommended)
10scaler = StandardScaler()
11embeddings_scaled = scaler.fit_transform(embeddings)
12
13# Apply PCA
14pca = PCA(n_components=2)  # Reduce to 2D for visualization
15embeddings_2d = pca.fit_transform(embeddings_scaled)
16
17# Check variance explained
18print(f"Variance explained: {pca.explained_variance_ratio_}")
19print(f"Total: {sum(pca.explained_variance_ratio_):.2%}")
20
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

PCA in Practice 💻


21# Visualize
22plt.figure(figsize=(10, 8))
23plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
24
25# Annotate some words
26words = ['king', 'queen', 'man', 'woman', 'cat', 'dog']
27for word in words:
28    idx = word_to_idx[word]
29    plt.annotate(word, (embeddings_2d[idx, 0], embeddings_2d[idx, 1]))
30
31plt.xlabel('PC1')
32plt.ylabel('PC2')
33plt.title('Word Embeddings - PCA Projection')
34plt.show()
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

PCA Limitations ⚠️

Assumes Linearity:

  • Only captures linear relationships
  • Word embeddings often have non-linear structure
  • May miss important patterns

1Non-linear manifold ->  PCA struggles here!

Other Issues:

  • Variance ≠ Importance:

  • Preserves variance, not semantic structure

  • Noisy dimensions may have high variance

  • Global focus:

    • May lose local structure
  • Clusters can become blurred

  • Interpretability:

    • PCs are linear combinations
  • Hard to interpret semantically

When to Use PCA
  • Quick first exploration
  • Preprocessing for other methods
  • When speed is critical
  • Linear structure expected
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

t-SNE: t-Distributed Stochastic Neighbor Embedding 🎨

Non-linear dimensionality reduction for visualization

Key Idea:

  • Model similarity as probability
  • High-D: Gaussian similarity
  • Low-D: t-distribution similarity
  • Minimize divergence between them
  • Preserves local neighborhood structure

Advantages:

  • Beautiful visualizations
  • Reveals clusters
  • Non-linear mappings
  • Great for exploration
  • Widely used in practice

How it works:

  1. Compute pairwise similarities in high-D:

    p_{j|i} = \exp(-||x_i - x_j||^2 / 2\sigma_i^2) / \sum_{k ≠ i} \exp(-||x_i - x_k||^2 / 2\sigma_i^2)

  2. Compute similarities in low-D using t-distribution:

    q_{ij} = (1 + ||y_i - y_j||^2)^{-1} / \sum_{k ≠ l}(1 + ||y_k - y_l||^2)^{-1}

  3. Minimize KL divergence:

    C = \sum_i KL(P_i || Q_i)

  4. Use gradient descent to optimize

Reference: van der Maaten & Hinton (2008). "Visualizing Data using t-SNE"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

t-SNE: Key Concepts 🔍

Why t-distribution in low-D?

The Crowding Problem Illustrated:


1High-D: 10 points can each have
2        9 equidistant neighbors
3
4      *   *   *
5        * * *
6      *   *   *
7
8Low-D (2D): Can't fit 9 equidistant
9            neighbors around 1 point!
10
11Solution: t-distribution has
12          heavier tails → allows
13          moderately-distant points
14          to spread further apart

Perplexity Effect (Same Data!):


1Perplexity = 5:
2  Tight, fragmented clusters
3  Good for fine structure
4
5Perplexity = 30:
6  Balanced view
7  Standard choice
8
9Perplexity = 100:
10  Merged clusters
11  More global view

Rule of thumb: perplexity ~ sqrt(n)

For 1000 words: try perplexity 30-50

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

t-SNE in Practice 💻


1from sklearn.manifold import TSNE
2import matplotlib.pyplot as plt
3
4# Apply t-SNE (can be slow for large datasets)
5tsne = TSNE(
6    n_components=2,      # 2D visualization
7    perplexity=30,       # try 5-50
8    n_iter=2000,         # iterations
9    random_state=42,     # reproducibility
10    verbose=1            # show progress
11)
12
13embeddings_2d = tsne.fit_transform(embeddings)
14
15# Visualize with labels
16plt.figure(figsize=(12, 10))
17
18# Color by semantic category (if available)
19categories = ['animals', 'food', 'technology', ...]
20colors = ['red', 'blue', 'green', ...]
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

t-SNE in Practice 💻


21
22for cat, color in zip(categories, colors):
23    mask = labels == cat
24    plt.scatter(
25        embeddings_2d[mask, 0],
26        embeddings_2d[mask, 1],
27        c=color,
28        label=cat,
29        alpha=0.6
30    )
31
32# Annotate some words
33for word, idx in word_to_idx.items():
34    if word in important_words:
35        plt.annotate(
36            word,
37            (embeddings_2d[idx, 0], embeddings_2d[idx, 1])
38        )
39
40plt.legend()
...continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

t-SNE in Practice 💻


41plt.title('Word Embeddings - t-SNE Projection')
42plt.show()
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

t-SNE Limitations and Caveats ⚠️

1. Slow: O(n²) complexity


1# Timing comparison
2n=1000:  ~10 seconds
3n=5000:  ~4 minutes
4n=10000: ~15 minutes

2. Non-deterministic:


1tsne1 = TSNE(random_state=42)
2tsne2 = TSNE(random_state=123)
3# Different layouts!

3. No out-of-sample:


1# Can't do this with t-SNE:
2new_point_2d = tsne.transform(new_point)
3# Error! Must refit entire dataset

4. Interpretation Pitfalls:


1WRONG interpretations:
2✗ "Cluster A is bigger than B"
3  (sizes are arbitrary)
4
5✗ "A and B are far apart"
6  (global distances not preserved)
7
8✗ "This dimension means X"
9  (axes have no meaning)
10
11CORRECT interpretations:
12✓ "Points in cluster A are similar"
13✓ "These words form a group"
14✓ "There appear to be N clusters"

Solution: Use PCA first to reduce to 50D, then t-SNE to 2D

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

UMAP: Uniform Manifold Approximation & Projection 🗺️

Modern alternative to t-SNE (2018)

Key Advantages over t-SNE:

  • Much faster (empirically ~O(n^1.14) vs. O(n²) for t-SNE)
  • Preserves local and global structure
  • Can transform new points
  • Theoretically grounded (topology)
  • Scales to millions of points
  • More robust hyperparameters

When to Use:

  • Large datasets (>10k points)
  • Need global structure
  • Production systems
  • Consistent results important
  • Most NLP visualization tasks!

How it Works:

  1. Construct fuzzy topological representation of high-D data
  2. Find low-D representation with similar topology
  3. Use Riemannian geometry
  4. Optimize cross-entropy loss

Main Hyperparameters:

1. n_neighbors (15):

  • Local vs. global balance
  • Small: local structure
  • Large: global structure

2. min_dist (0.1):

  • How tightly to pack points
  • Small: tight clusters
  • Large: loose, spread out

Reference: McInnes & Healy (2018). "UMAP: Uniform Manifold Approximation and Projection"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

UMAP in Practice 💻


1import umap
2import matplotlib.pyplot as plt
3
4# Apply UMAP
5reducer = umap.UMAP(
6    n_components=2,      # 2D visualization
7    n_neighbors=15,      # local/global balance (try 5-50)
8    min_dist=0.1,        # cluster tightness (try 0.0-0.99)
9    metric='cosine',     # good for word embeddings
10    random_state=42
11)
12
13embeddings_2d = reducer.fit_transform(embeddings)
14
15# Can also transform new points! (unlike t-SNE)
16new_embeddings_2d = reducer.transform(new_embeddings)
17
18# Visualize
19plt.figure(figsize=(12, 10))
20scatter = plt.scatter(
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

UMAP in Practice 💻


21    embeddings_2d[:, 0],
22    embeddings_2d[:, 1],
23    c=cluster_labels,     # color by cluster
24    cmap='Spectral',
25    s=5,
26    alpha=0.6
27)
28plt.colorbar(scatter)
29
30# Annotate
31for word in important_words:
32    idx = word_to_idx[word]
33    plt.annotate(
34        word,
35        (embeddings_2d[idx, 0], embeddings_2d[idx, 1]),
36        fontsize=12
37    )
38
39plt.title('Word Embeddings - UMAP Projection')
40plt.show()
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

PCA vs. t-SNE vs. UMAP 🔍

Feature PCA t-SNE UMAP
Speed Very Fast Slow Fast
Scalability Excellent Poor (<10k) Excellent
Global structure Yes No Yes
Local structure Partial Yes Yes
Deterministic Yes No Partial
Out-of-sample Yes No Yes

Concrete Timing Comparison (10,000 word embeddings):


1# PCA: 0.3 seconds
2pca = PCA(n_components=2).fit_transform(X)
3
4# t-SNE: 180 seconds (3 minutes!)
5tsne = TSNE(n_components=2).fit_transform(X)
6
7# UMAP: 12 seconds
8umap = UMAP(n_components=2).fit_transform(X)
Best Practice Workflow

1# Step 1: PCA to 50D (fast, removes noise)
2X_50d = PCA(n_components=50).fit_transform(X)
3
4# Step 2: UMAP to 2D (preserves structure)
5X_2d = UMAP(n_components=2).fit_transform(X_50d)
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Visualization Best Practices 🎨

  1. Preprocessing:
    • Standardize features (mean=0, std=1)
  • Remove outliers (optional)
  • For very large datasets: sample or use PCA first
  2. Try multiple hyperparameters:
    • t-SNE perplexity: 5, 10, 30, 50
  • UMAP n_neighbors: 5, 15, 30, 50
  • Different views reveal different structure
  3. Color intelligently:
    • By semantic category
  • By cluster assignment
  • By frequency (size)
  • By POS tag, domain, etc.
  4. Interactive visualization:
    • Use plotly, bokeh for interactivity
  • Hover tooltips with word info
  • Zoom and pan
  • Filter by category
  5. Annotate selectively:
    • Too many labels = clutter
  • Show representative words
  • Use repel/adjust_text to avoid overlap
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Interactive Visualization 💡


1import plotly.express as px
2import plotly.graph_objects as go
3import pandas as pd
4
5# Prepare data
6df = pd.DataFrame({
7    'x': embeddings_2d[:, 0],
8    'y': embeddings_2d[:, 1],
9    'word': words,
10    'category': categories,
11    'frequency': frequencies
12})
13
14# Create interactive plot with plotly
15fig = px.scatter(
16    df,
17    x='x',
18    y='y',
19    color='category',
20    size='frequency',
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Interactive Visualization 💡


21    hover_data=['word', 'frequency'],
22    title='Word Embeddings (UMAP)',
23    width=1000,
24    height=800
25)
26
27# Add text annotations for important words
28for word in important_words:
29    row = df[df['word'] == word].iloc[0]
30    fig.add_annotation(
31        x=row['x'],
32        y=row['y'],
33        text=word,
34        showarrow=False,
35        font=dict(size=12)
36    )
37
38# Show
39fig.show()
40
...continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Interactive Visualization 💡


41# Can also save as HTML
42fig.write_html('embeddings_viz.html')
...continued
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Applications in NLP 🚀

1. Embedding Quality Check:


1# Quick sanity check
2words = ['dog', 'cat', 'fish',  # animals
3         'car', 'bus', 'train'] # vehicles
4
5# If good embeddings: two clusters!
6# If bad: random scatter

2. Finding Polysemous Words:


1"apple" appears in TWO clusters:
2- Near: orange, banana, fruit
3- Near: microsoft, google, tech
4
5→ Word has multiple senses!

3. Domain Vocabulary Analysis:


1Medical corpus visualization:
2Cluster 1: symptoms (fever, cough...)
3Cluster 2: treatments (aspirin, surgery...)
4Cluster 3: anatomy (heart, liver...)

4. Model Layer Comparison:


1# Extract embeddings from different layers
2layer_1 = get_bert_layer(1)   # Syntax
3layer_6 = get_bert_layer(6)   # Semantics
4layer_12 = get_bert_layer(12) # Task-specific
5
6# Visualize each: different structure!

5. Document Clustering:


1# Visualize document embeddings
2doc_embeddings = model.encode(documents)
3umap_2d = umap.UMAP().fit_transform(
4    doc_embeddings)
5
6# Color by topic → see topic separation
7plt.scatter(umap_2d[:, 0], umap_2d[:, 1],
8            c=topic_labels)
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Case Study: Visualizing BERT Layers 🔬

Question: What do different BERT layers capture?

Approach:

  1. Extract embeddings from each layer
  2. Apply UMAP to each layer separately
  3. Visualize and compare

Typical Findings:

  • Layer 0-2: Syntactic (POS tags cluster)
  • Layer 3-8: Semantic (meaning clusters)
  • Layer 9-12: Task-specific

Insights:

  • Lower layers: syntax
  • Middle layers: semantics
  • Upper layers: task adaptation
  • Confirms linguistic hierarchy

1from transformers import BertModel, BertTokenizer
2
3model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
4tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
5
6# Get embeddings from all layers
7tokens = tokenizer(sentences, return_tensors='pt', padding=True)
8outputs = model(**tokens)
9
10# outputs.hidden_states: tuple of 13 tensors
11# (embedding layer + 12 transformer layers)
12
13for layer_idx in range(13):
14    layer_embeddings = outputs.hidden_states[layer_idx]
15
16    # Average over sequence length
17    avg_embeddings = layer_embeddings.mean(dim=1).numpy()
18
19    # Apply UMAP
20    reducer = umap.UMAP(n_components=2)
continued...
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Case Study: Visualizing BERT Layers 🔬


21    viz_2d = reducer.fit_transform(avg_embeddings)
22
23    # Plot
24    plt.figure()
25    plt.scatter(viz_2d[:, 0], viz_2d[:, 1], c=pos_tags)
26    plt.title(f'Layer {layer_idx}')
27    plt.show()
...continued

Reference: Jawahar et al. (2019). "What Does BERT Learn about the Structure of Language?"

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Discussion Question 💬

What does a 2D visualization actually tell us about 300D space?

Consider:

  • We compress 300 dimensions into 2
  • Massive information loss (>99%)
  • Different methods show different views
  • Hyperparameters change the story

What we CAN conclude:

  • Rough semantic groupings
  • Relative proximities
  • Cluster existence
  • Outliers and anomalies
  • Qualitative patterns

What we CANNOT conclude:

  • Exact distances
  • True high-D structure
  • Cluster sizes (t-SNE)
  • Between-cluster distances
  • Quantitative relationships
Important

Dimensionality reduction is a lossy, qualitative tool — not a precise measurement. Always combine visualization with quantitative evaluation!

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Practical Tips 💡

  1. Choose the right tool:
    • Default: UMAP (fast, balanced)
  • Need speed: PCA
  • Small dataset, only local: t-SNE
  2. Preprocessing matters:
    • StandardScaler for PCA
  • Consider normalization for UMAP/t-SNE
  • Remove extreme outliers
  3. Try multiple settings:
    • Run with different hyperparameters
  • Compare results
  • Don't cherry-pick!
  4. Use color wisely:
    • Semantic categories
  • Continuous values (frequency, sentiment)
  • Consider colorblind-friendly palettes
  5. Save your random seeds:
    • Makes results reproducible
  • Important for publications
  • Helps debugging
  6. Computational tips:
    • For >100k points: PCA first to 50D, then UMAP
  • Use n_jobs=-1 for parallel computation
  • Consider approximate nearest neighbors (Annoy, FAISS)
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Summary 🎯

What we learned today:

  1. Curse of Dimensionality: High-D space is weird, need reduction
  2. PCA:
    • Linear, fast, preserves global structure
  • Good for preprocessing and quick exploration
  3. t-SNE:
    • Non-linear, preserves local structure
  • Beautiful visualizations but slow
  • Sensitive to hyperparameters
  4. UMAP:
    • Fast, scalable, balanced local+global
  • Best default choice for NLP
  • Can transform new points
  5. Best Practices:
    • Try multiple methods and parameters
  • Use interactive visualizations
  • Don't over-interpret 2D projections

Next: Cognitive models of semantic representation!

Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Key References 📚

Foundational Papers:

  • Pearson, K. (1901). "On Lines and Planes of Closest Fit to Systems of Points in Space" (PCA)
  • van der Maaten & Hinton (2008). "Visualizing Data using t-SNE"
  • McInnes et al. (2018). "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction"

Applications:

  • Jawahar et al. (2019). "What Does BERT Learn about the Structure of Language?"
  • Coenen et al. (2019). "Visualizing and Measuring the Geometry of BERT"

Guides & Tutorials:

Tools:

  • Scikit-learn: PCA, t-SNE
  • UMAP-learn: pip install umap-learn
  • Plotly: pip install plotly
Winter 2026
PSYC 51.07: Models of Language and Communication - Week 4

Questions? 🙋

Next Lecture:

Cognitive Models of Semantic Representation

How do humans represent meaning?

Winter 2026

Timeline - see original for details