* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
The classic linear dimensionality reduction method
Intuition:
Properties:
PC1 captures most variation
PC2 captures remaining variation (orthogonal to PC1)
Step-by-Step Worked Example:
Original Data (3D → 2D):
| Word | x1 | x2 | x3 |
|---|---|---|---|
| king | 2.0 | 1.5 | 0.1 |
| queen | 1.8 | 1.6 | 0.2 |
| man | 1.0 | 0.5 | 0.1 |
| woman | 0.9 | 0.6 | 0.2 |
Step 1: Center data (subtract mean)
Step 2: Compute covariance matrix C
Step 3: Find eigenvectors/eigenvalues
Eigenvalues (variance captured):
Keep top 2 PCs → 98% variance retained!
Projected Data (2D):
| Word | PC1 | PC2 |
|---|---|---|
| king | 1.9 | 0.3 |
| queen | 1.7 | 0.4 |
| man | 0.8 | -0.2 |
| woman | 0.7 | -0.1 |
Gender still separable, royalty still clusters!
1from sklearn.decomposition import PCA
2from sklearn.preprocessing import StandardScaler
3import numpy as np
4import matplotlib.pyplot as plt
5
6# Assume we have word embeddings: shape (vocab_size, 300)
7# For example, Word2Vec embeddings
8
9# Standardize the data (optional but recommended)
10scaler = StandardScaler()
11embeddings_scaled = scaler.fit_transform(embeddings)
12
13# Apply PCA
14pca = PCA(n_components=2) # Reduce to 2D for visualization
15embeddings_2d = pca.fit_transform(embeddings_scaled)
16
17# Check variance explained
18print(f"Variance explained: {pca.explained_variance_ratio_}")
19print(f"Total: {sum(pca.explained_variance_ratio_):.2%}")
20
21# Visualize
22plt.figure(figsize=(10, 8))
23plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.5)
24
25# Annotate some words
26words = ['king', 'queen', 'man', 'woman', 'cat', 'dog']
27for word in words:
28 idx = word_to_idx[word]
29 plt.annotate(word, (embeddings_2d[idx, 0], embeddings_2d[idx, 1]))
30
31plt.xlabel('PC1')
32plt.ylabel('PC2')
33plt.title('Word Embeddings - PCA Projection')
34plt.show()
Assumes Linearity:
Non-linear manifold -> PCA struggles here!
Other Issues:
Variance ≠ Importance:
Preserves variance, not semantic structure
Noisy dimensions may have high variance
Global focus:
Clusters can become blurred
Interpretability:
Hard to interpret semantically
Non-linear dimensionality reduction for visualization
Key Idea:
Advantages:
How it works:
Compute pairwise similarities in high-D:
p_{j|i} = \frac{\exp(-\|x_i - x_j\|^2 / 2\sigma_i^2)}{\sum_{k \neq i} \exp(-\|x_i - x_k\|^2 / 2\sigma_i^2)}
Compute similarities in low-D using t-distribution:
q_{ij} = \frac{(1 + \|y_i - y_j\|^2)^{-1}}{\sum_{k \neq l}(1 + \|y_k - y_l\|^2)^{-1}}
Minimize KL divergence:
C = \sum_i KL(P_i || Q_i)
Use gradient descent to optimize
Reference: van der Maaten & Hinton (2008). "Visualizing Data using t-SNE"
Why t-distribution in low-D?
The Crowding Problem Illustrated:
1High-D: 10 points can each have
2 9 equidistant neighbors
3
4 * * *
5 * * *
6 * * *
7
8Low-D (2D): Can't fit 9 equidistant
9 neighbors around 1 point!
10
11Solution: t-distribution has
12 heavier tails → allows
13 moderately-distant points
14 to spread further apart
Perplexity Effect (Same Data!):
1Perplexity = 5:
2 Tight, fragmented clusters
3 Good for fine structure
4
5Perplexity = 30:
6 Balanced view
7 Standard choice
8
9Perplexity = 100:
10 Merged clusters
11 More global view
Rule of thumb: perplexity ~ sqrt(n)
For 1000 words: try perplexity 30-50
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Apply t-SNE (can be slow for large datasets)
tsne = TSNE(
    n_components=2,    # 2D visualization
    perplexity=30,     # try 5-50
    max_iter=2000,     # iterations (this argument was called n_iter before scikit-learn 1.5)
    random_state=42,   # reproducibility
    verbose=1          # show progress
)

embeddings_2d = tsne.fit_transform(embeddings)

# Visualize with labels
plt.figure(figsize=(12, 10))

# Color by semantic category (if available).
# Extend both lists with your own categories; they are paired up by position.
categories = ['animals', 'food', 'technology']
colors = ['red', 'blue', 'green']

# NOTE: `labels` is assumed to be a NumPy array holding one category name per
# row of `embeddings`, so that `labels == cat` yields a boolean mask.
for cat, color in zip(categories, colors):
    mask = labels == cat
    plt.scatter(
        embeddings_2d[mask, 0],
        embeddings_2d[mask, 1],
        c=color,
        label=cat,
        alpha=0.6
    )

# Annotate some words
for word, idx in word_to_idx.items():
    if word in important_words:
        plt.annotate(
            word,
            (embeddings_2d[idx, 0], embeddings_2d[idx, 1])
        )

plt.legend()
plt.title('Word Embeddings - t-SNE Projection')
plt.show()
1. Slow:
1# Timing comparison
2n=1000: ~10 seconds
3n=5000: ~4 minutes
4n=10000: ~15 minutes
2. Non-deterministic:
1tsne1 = TSNE(random_state=42)
2tsne2 = TSNE(random_state=123)
3# Different layouts!
3. No out-of-sample:
1# Can't do this with t-SNE:
2new_point_2d = tsne.transform(new_point)
3# Error! Must refit entire dataset
4. Interpretation Pitfalls:
1WRONG interpretations:
2✗ "Cluster A is bigger than B"
3 (sizes are arbitrary)
4
5✗ "A and B are far apart"
6 (global distances not preserved)
7
8✗ "This dimension means X"
9 (axes have no meaning)
10
11CORRECT interpretations:
12✓ "Points in cluster A are similar"
13✓ "These words form a group"
14✓ "There appear to be N clusters"
Solution: Use PCA first to reduce to 50D, then t-SNE to 2D
Modern alternative to t-SNE (2018)
Key Advantages over t-SNE:
When to Use:
How it Works:
Main Hyperparameters:
1. n_neighbors (15):
2. min_dist (0.1):
Reference: McInnes, Healy & Melville (2018). "UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction"
1import umap
2import matplotlib.pyplot as plt
3
4# Apply UMAP
5reducer = umap.UMAP(
6 n_components=2, # 2D visualization
7 n_neighbors=15, # local/global balance (try 5-50)
8 min_dist=0.1, # cluster tightness (try 0.0-0.99)
9 metric='cosine', # good for word embeddings
10 random_state=42
11)
12
13embeddings_2d = reducer.fit_transform(embeddings)
14
15# Can also transform new points! (unlike t-SNE)
16new_embeddings_2d = reducer.transform(new_embeddings)
17
18# Visualize
19plt.figure(figsize=(12, 10))
20scatter = plt.scatter(
21 embeddings_2d[:, 0],
22 embeddings_2d[:, 1],
23 c=cluster_labels, # color by cluster
24 cmap='Spectral',
25 s=5,
26 alpha=0.6
27)
28plt.colorbar(scatter)
29
30# Annotate
31for word in important_words:
32 idx = word_to_idx[word]
33 plt.annotate(
34 word,
35 (embeddings_2d[idx, 0], embeddings_2d[idx, 1]),
36 fontsize=12
37 )
38
39plt.title('Word Embeddings - UMAP Projection')
40plt.show()
| Feature | PCA | t-SNE | UMAP |
|---|---|---|---|
| Speed | Very Fast | Slow | Fast |
| Scalability | Excellent | Poor (<10k) | Excellent |
| Global structure | Yes | No | Yes |
| Local structure | Partial | Yes | Yes |
| Deterministic | Yes | No | Partial |
| Out-of-sample | Yes | No | Yes |
Concrete Timing Comparison (10,000 word embeddings):
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap

# Each result is the projected array, so it gets an X_* name rather than the
# name of the estimator (and the `umap` module is not overwritten).

# PCA: 0.3 seconds
X_pca = PCA(n_components=2).fit_transform(X)

# t-SNE: 180 seconds (3 minutes!)
X_tsne = TSNE(n_components=2).fit_transform(X)

# UMAP: 12 seconds
X_umap = umap.UMAP(n_components=2).fit_transform(X)
from sklearn.decomposition import PCA
import umap

# Step 1: PCA to 50D (fast, removes noise)
X_50d = PCA(n_components=50).fit_transform(X)

# Step 2: UMAP to 2D (preserves structure)
X_2d = umap.UMAP(n_components=2).fit_transform(X_50d)
1import plotly.express as px
2import plotly.graph_objects as go
3import pandas as pd
4
5# Prepare data
6df = pd.DataFrame({
7 'x': embeddings_2d[:, 0],
8 'y': embeddings_2d[:, 1],
9 'word': words,
10 'category': categories,
11 'frequency': frequencies
12})
13
14# Create interactive plot with plotly
15fig = px.scatter(
16 df,
17 x='x',
18 y='y',
19 color='category',
20 size='frequency',
21 hover_data=['word', 'frequency'],
22 title='Word Embeddings (UMAP)',
23 width=1000,
24 height=800
25)
26
27# Add text annotations for important words
28for word in important_words:
29 row = df[df['word'] == word].iloc[0]
30 fig.add_annotation(
31 x=row['x'],
32 y=row['y'],
33 text=word,
34 showarrow=False,
35 font=dict(size=12)
36 )
37
38# Show
39fig.show()
40
41# Can also save as HTML
42fig.write_html('embeddings_viz.html')
1. Embedding Quality Check:
1# Quick sanity check
2words = ['dog', 'cat', 'fish', # animals
3 'car', 'bus', 'train'] # vehicles
4
5# If good embeddings: two clusters!
6# If bad: random scatter
2. Finding Polysemous Words:
1"apple" appears in TWO clusters:
2- Near: orange, banana, fruit
3- Near: microsoft, google, tech
4
5→ Word has multiple senses!
3. Domain Vocabulary Analysis:
1Medical corpus visualization:
2Cluster 1: symptoms (fever, cough...)
3Cluster 2: treatments (aspirin, surgery...)
4Cluster 3: anatomy (heart, liver...)
4. Model Layer Comparison:
1# Extract embeddings from different layers
2layer_1 = get_bert_layer(1) # Syntax
3layer_6 = get_bert_layer(6) # Semantics
4layer_12 = get_bert_layer(12) # Task-specific
5
6# Visualize each: different structure!
5. Document Clustering:
1# Visualize document embeddings
2doc_embeddings = model.encode(documents)
3umap_2d = umap.UMAP().fit_transform(
4 doc_embeddings)
5
6# Color by topic → see topic separation
7plt.scatter(umap_2d[:, 0], umap_2d[:, 1],
8 c=topic_labels)
Question: What do different BERT layers capture?
Approach:
Typical Findings:
Insights:
import matplotlib.pyplot as plt
import torch
import umap
from transformers import BertModel, BertTokenizer

model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Get embeddings from all layers.
# no_grad: we only need activations, and .numpy() below fails on tensors that
# still require grad.
tokens = tokenizer(sentences, return_tensors='pt', padding=True)
with torch.no_grad():
    outputs = model(**tokens)

# outputs.hidden_states: tuple of 13 tensors
# (embedding layer + 12 transformer layers)

# padding=True adds pad tokens to shorter sentences; use the attention mask so
# they do not contribute to the per-sentence average.
mask = tokens['attention_mask'].unsqueeze(-1).to(outputs.hidden_states[0].dtype)
token_counts = mask.sum(dim=1)

for layer_idx, layer_embeddings in enumerate(outputs.hidden_states):
    # Average over the real (non-padding) tokens of each sentence
    avg_embeddings = ((layer_embeddings * mask).sum(dim=1) / token_counts).numpy()

    # Apply UMAP
    reducer = umap.UMAP(n_components=2)
    viz_2d = reducer.fit_transform(avg_embeddings)

    # Plot
    # NOTE(review): each point is one sentence, so `pos_tags` must hold one
    # value per sentence -- confirm against how it is built.
    plt.figure()
    plt.scatter(viz_2d[:, 0], viz_2d[:, 1], c=pos_tags)
    plt.title(f'Layer {layer_idx}')
    plt.show()
Reference: Jawahar et al. (2019). "What Does BERT Learn about the Structure of Language?"
What does a 2D visualization actually tell us about 300D space?
Consider:
What we CAN conclude:
What we CANNOT conclude:
Dimensionality reduction is a lossy approximation. Always combine visualization with quantitative evaluation!
What we learned today:
Next: Cognitive models of semantic representation!
Foundational Papers:
Applications:
Guides & Tutorials:
Tools:
pip install umap-learn
pip install plotly
Next Lecture:
Cognitive Models of Semantic Representation
How do humans represent meaning?
Timeline - see original for details