* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
Words that appear in similar contexts should have similar representations
"The quick brown [TARGET] jumped over the lazy dog"
Context: [The, quick, brown, jumped, over, the, lazy, dog]
Target: fox
Key Idea:
We don't actually care about the prediction task—we want the embeddings!
Sentence: "The cat sat on the mat"
Window size = 2 (2 words on each side)
Position 0: "The" → Context: [cat, sat]
Position 1: "cat" → Context: [The, sat, on]
Position 2: "sat" → Context: [The, cat, on, the]
Position 3: "on" → Context: [cat, sat, the, mat]
Position 4: "the" → Context: [sat, on, mat]
Position 5: "mat" → Context: [on, the]
Skip-gram training pairs (target → context):
(The, cat), (The, sat)
(cat, The), (cat, sat), (cat, on)
(sat, The), (sat, cat), (sat, on), (sat, the)
...
Result: Words appearing in similar contexts get similar vectors!
Predict the center word from context words
Context words:
 "the" ──┐
 "cat" ──┼──→ Average ──→ "sat"
 "on" ──┼──→ Hidden ──→ (predict)
 "the" ──┘
Architecture:
Training:
Characteristics:
Objective:
Predict context words from the center word
Center word: Predict context:
 ┌──→ "the"
 ├──→ "cat"
 "sat" ──→ Hidden ┼──→ "on"
 └──→ "the"
Architecture:
Training:
Characteristics:
Objective:
Problem: Softmax over entire vocabulary is too expensive!
For 100k vocabulary: Need to compute 100k exponentials per training example!
Solution: Negative Sampling
Instead of predicting across all words, create a binary classification:
Typical K: 5-20 for large datasets, 2-5 for small datasets
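Training becomes $O(K)$ per training pair instead of $O(|V|)$, where $K$ is the number of negative samples and $|V|$ is the vocabulary size.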
Reference: Mikolov et al. (2013b). "Distributed Representations of Words and Phrases"
Training pair: ("cat", "sat") - cat is center, sat is context
1# Positive sample: Does "sat" appear near "cat"? YES (label=1)
2positive_pair = ("cat", "sat", label=1)
3
4# Negative samples: Random words that DON'T appear near "cat"
5# Sample 5 random words from vocabulary
6negative_pairs = [
7 ("cat", "algorithm", label=0),
8 ("cat", "president", label=0),
9 ("cat", "quantum", label=0),
10 ("cat", "democracy", label=0),
11 ("cat", "software", label=0),
12]
13
14# Train binary classifier: Is this a real context pair?
15# Instead of 100k-way softmax → 6 binary predictions!
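Sampling distribution: $P(w_i) = \dfrac{f(w_i)^{0.75}}{\sum_{j} f(w_j)^{0.75}}$, where $f(w)$ is the unigram frequency of word $w$.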
The 0.75 power gives rare words slightly higher probability of being sampled as negatives.
1from gensim.models import Word2Vec
2import gensim.downloader as api
3
4# Load pre-trained model (300 dimensions, 3B words)
5model = api.load('word2vec-google-news-300')
6
7# Find similar words
8similar = model.most_similar('computer', topn=5)
9print(similar)
10# Output: [('computers', 0.72), ('laptop', 0.69), ('PC', 0.68), ...]
11
12# Word analogies: king - man + woman = ?
13result = model.most_similar(
14 positive=['king', 'woman'],
15 negative=['man'],
16 topn=1
17)
18print(result)
19# Output: [('queen', 0.71)]
20
21# Train your own model
22sentences = [
23 ['the', 'cat', 'sat', 'on', 'the', 'mat'],
24 ['the', 'dog', 'ran', 'in', 'the', 'park'],
25 # ... more sentences
26]
27
28custom_model = Word2Vec(
29 sentences,
30 vector_size=100, # embedding dimension
31 window=5, # context window
32 min_count=1, # minimum word frequency to keep (1 = keep every word)
33 sg=1, # 1=skip-gram, 0=CBOW
34 negative=5, # negative sampling
35 workers=4 # parallel threads
36)
Combining the best of count-based and prediction-based methods
Motivation:
Key Insight:
Ratios of co-occurrence probabilities encode meaning better than raw probabilities!
Co-occurrence Example:
| Probe word | P(word|ice) | P(word|steam) | Ratio |
|---|---|---|---|
| solid | high | low | >> 1 |
| gas | low | high | << 1 |
| water | high | high | ~ 1 |
| fashion | low | low | ~ 1 |
Idea:
Learn word vectors such that their dot product relates to co-occurrence probability ratios
Reference: Pennington et al. (2014). "GloVe: Global Vectors for Word Representation"
Why ratios matter more than raw probabilities:
1Given words: "ice" and "steam"
2Probe with: "solid", "gas", "water"
3
4P(solid | ice) = 0.00019 P(solid | steam) = 0.000022
5P(gas | ice) = 0.000066 P(gas | steam) = 0.00078
6
7Ratio: P(solid|ice) / P(solid|steam) = 8.9 → "solid" relates to ice
8Ratio: P(gas|ice) / P(gas|steam) = 0.085 → "gas" relates to steam
9Ratio: P(water|ice) / P(water|steam) = 1.36 → "water" is neutral
The insight: These ratios distinguish relevant context words from irrelevant ones!
GloVe learns vectors where:
Goal: Learn vectors that capture co-occurrence statistics
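Objective Function: $J = \sum_{i,j=1}^{V} f(X_{ij}) \left( w_i^{\top} \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2$
where: $X_{ij}$ is the number of times word $j$ occurs in the context of word $i$; $w_i$ and $\tilde{w}_j$ are the word and context vectors; $b_i$ and $\tilde{b}_j$ are bias terms; $V$ is the vocabulary size; and $f$ is the weighting function.
Weighting Function: $f(x) = (x / x_{\max})^{\alpha}$ if $x < x_{\max}$, and $f(x) = 1$ otherwise.
Purpose: Prevent very common co-occurrences from dominating (typically $x_{\max} = 100$, $\alpha = 0.75$).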
Word2Vec (Skip-gram):
Advantages:
GloVe:
Advantages:
In practice: Similar performance on most tasks! Choose based on:
1import gensim.downloader as api
2import numpy as np
3
4# Load pre-trained GloVe embeddings
5# Options: glove-wiki-gigaword-50, -100, -200, -300
6# Or: glove-twitter-25, -50, -100, -200
7glove = api.load("glove-wiki-gigaword-100")
8
9# Use just like Word2Vec
10similar = glove.most_similar('computer', topn=5)
11print(similar)
12
13# Analogies
14result = glove.most_similar(
15 positive=['france', 'berlin'],
16 negative=['paris'],
17 topn=1
18)
19print(result) # Should be close to 'germany'
20
21# Vector arithmetic
22king = glove['king']
23man = glove['man']
24woman = glove['woman']
25result_vec = king - man + woman
26
27# Find closest word
28closest = glove.similar_by_vector(result_vec, topn=1)
29print(closest) # Should be close to 'queen'
30
31# Compute similarity
32similarity = glove.similarity('cat', 'dog')
33print(f"Similarity: {similarity:.3f}")
The Problem with Word2Vec and GloVe:
Example:
FastText Solution: Represent words as bags of character n-grams.
Reference: Bojanowski et al. (2017). "Enriching Word Vectors with Subword Information"
Key Idea: Break words into character n-grams
Character trigrams: <wh, whe, her, ere, re>
Plus: <where> (the whole word)
Word vector: the sum of the vectors of all its character n-grams (including the whole-word token).
Advantages:
Example OOV:
Have trained on: "run", "running"
Need vector for: "runnable"
N-grams: <ru, run, unn, nna, nab, abl, ble, le>
Many overlap with "run" and "running"!
How FastText handles related words:
1# Word: "unhappiness" broken into character 3-grams:
2# <un, unh, nha, hap, app, ppi, pin, ine, nes, ess, ss>
3
4# Shares n-grams with:
5# "unhappy" → <un, unh, nha, hap, app, ppy
6# "happiness" → hap, app, ppi, pin, ine, nes, ess, ss>
7# "happy" → hap, app, ppy
8
9# Therefore: vec("unhappiness") is close to:
10# - vec("unhappy") (prefix overlap)
11# - vec("happiness") (suffix overlap)
12# - vec("sadness") (similar suffix pattern)
This is why FastText excels at morphologically rich languages!
| Language | Morphology | FastText Advantage |
|---|---|---|
| English | Low | Moderate |
| German | High | Significant |
| Turkish | Very High | Essential |
| Finnish | Extreme | Critical |
1from gensim.models import FastText
2import gensim.downloader as api
3
4# Load pre-trained FastText model
5# Available in 157 languages!
6fasttext_model = api.load('fasttext-wiki-news-subwords-300')
7
8# Works for known words
9vec_cat = fasttext_model['cat']
10
11# Also works for unknown words! (OOV)
12vec_unknownword = fasttext_model['unknownword'] # Still get a vector!
13
14# Even works for misspellings (somewhat)
15vec_misspelling = fasttext_model['computr'] # Close to "computer"
16
17# Train your own FastText model
18sentences = [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]
19
20ft_model = FastText(
21 sentences,
22 vector_size=100,
23 window=5,
24 min_count=1,
25 min_n=3, # minimum n-gram length
26 max_n=6, # maximum n-gram length
27 word_ngrams=1 # use word + ngrams
28)
29
30# Check similar words
31similar = ft_model.wv.most_similar('cat', topn=5)
32
33# Morphological awareness
34# 'running', 'runner', 'runnable' will be close
| Method | Year | Type | Handles OOV | Speed | Best for |
|---|---|---|---|---|---|
| LDA | 2003 | Probabilistic | ✗ | Slow | Topic modeling |
| Word2Vec | 2013 | Neural (local) | ✗ | Fast | General NLP |
| GloVe | 2014 | Hybrid (global) | ✗ | Fast | Large corpora |
| FastText | 2017 | Neural + ngrams | ✓ | Fast | Morphology-rich |
Typical dimensions: 50-300 (100-300 most common)
Intrinsic Evaluation:
1. Word Similarity
2. Word Analogies
3. Categorization
Extrinsic Evaluation:
Use embeddings as features in downstream tasks:
Intrinsic scores don't always correlate with extrinsic performance!
Best approach: Evaluate on your actual task.
The fundamental problem: One vector per word type
All get the same vector! This conflates different meanings.
Other Limitations:
The Solution:
% \includegraphics[width=0.8\textwidth]{transformer_preview.png}
(Image placeholder - transformer architecture preview)
Embeddings learn the biases in their training data
Other Biases:
Why This Matters:
References: Bolukbasi et al. (2016). "Man is to Computer Programmer as Woman is to Homemaker?"
Caliskan et al. (2017). "Semantics derived automatically from language corpora contain human-like biases"
1import gensim.downloader as api
2model = api.load('word2vec-google-news-300')
3
4# Measure gender bias: which words are closer to "man" vs "woman"?
5def gender_bias_score(word):
6 """Positive = closer to man, Negative = closer to woman"""
7 return model.similarity(word, 'man') - model.similarity(word, 'woman')
8
9occupations = ['doctor', 'nurse', 'engineer', 'teacher',
10 'programmer', 'secretary', 'scientist', 'receptionist']
11
12for job in occupations:
13 score = gender_bias_score(job)
14 direction = "→ man" if score > 0 else "→ woman"
15 print(f"{job:12}: {score:+.3f} {direction}")
16
17# Output:
18# doctor : +0.089 → man
19# nurse : -0.109 → woman
20# engineer : +0.078 → man
21# teacher : -0.046 → woman
22# programmer : +0.091 → man
23# secretary : -0.107 → woman
These biases reflect stereotypes in the training data (news articles)!
How can we reduce bias in embeddings?
1. Post-processing:
2. Training-time:
3. Data curation:
Challenges:
Debiasing is an active research area. No perfect solution yet!
Best practice:
1import gensim.downloader as api
2
3# Word2Vec (Google News, 3B words, 300d)
4w2v = api.load('word2vec-google-news-300')
5
6# GloVe options
7glove_50d = api.load('glove-wiki-gigaword-50') # Small, fast
8glove_100d = api.load('glove-wiki-gigaword-100') # Balanced
9glove_300d = api.load('glove-wiki-gigaword-300') # Best quality
10glove_twitter = api.load('glove-twitter-100') # Social media
11
12# FastText (handles OOV!)
13fasttext = api.load('fasttext-wiki-news-subwords-300')
14
15# Basic usage (same for all)
16similar = model.most_similar('computer', topn=5)
17vector = model['cat']
18similarity = model.similarity('dog', 'cat')
Pro tip: Start with glove-wiki-gigaword-100 for a good balance of speed and quality!
Search & Information Retrieval:
Text Classification:
Sequence Labeling:
Recommendation Systems:
Machine Translation:
Creative Applications:
Example: Google Search uses embeddings to understand query intent and match relevant documents!
Why do word analogies work so well?
The famous example:
Think about:
Are these embeddings truly capturing meaning, or just statistical patterns?
Does the distributional hypothesis have limits?
Reference: Boleda (2020). "Distributional Semantics and Linguistic Theory"
What we learned today:
Foundational Papers:
Bias & Ethics:
Theory:
Tools:
Next Lecture:
Contextual Embeddings: ELMo, Universal Sentence Encoder, BERT
One word, multiple meanings!