* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF matrix (rows = documents, columns = vocabulary terms)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = tfidf.fit_transform(documents)

# Apply LSA: truncated SVD of the TF-IDF matrix
lsa = TruncatedSVD(n_components=100, random_state=42)
doc_embeddings = lsa.fit_transform(tfidf_matrix)
# components_ is (n_components, n_terms); transpose to get one row per term
word_embeddings = lsa.components_.T

print(f"Explained variance: {lsa.explained_variance_ratio_.sum():.2%}")
Try it: Find similar words using cosine similarity!
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Get vocabulary mapping
vocab = tfidf.get_feature_names_out()
word_to_idx = {word: i for i, word in enumerate(vocab)}

def find_similar_words(word, top_n=5):
    """Find words with similar LSA embeddings.

    Returns a list of (word, similarity) pairs, most similar first, with the
    similarity formatted to three decimals. If `word` is not in the
    vocabulary, a message string is returned instead.
    """
    if word not in word_to_idx:
        return f"'{word}' not in vocabulary"

    idx = word_to_idx[word]
    word_vec = word_embeddings[idx].reshape(1, -1)

    # Compute similarities to all words
    sims = cosine_similarity(word_vec, word_embeddings)[0]

    # Get top N, excluding the query word by index rather than assuming it is
    # always ranked first (ties or an all-zero vector can break that).
    ranked = [i for i in sims.argsort()[::-1] if i != idx]
    top_indices = ranked[:top_n]
    return [(vocab[i], f"{sims[i]:.3f}") for i in top_indices]

print(find_similar_words("computer"))
# Output: [('software', '0.820'), ('program', '0.790'),
#          ('system', '0.710'), ('hardware', '0.680'), ('disk', '0.650')]
A probabilistic approach:
LDA imagines documents are created by:
Key difference from LSA:
Topic 0: hockey, game, team, player, season, nhl, play
Topic 1: space, nasa, launch, orbit, shuttle, satellite
Topic 2: computer, software, program, file, windows, system
Topic 3: medical, doctor, patient, disease, health, treatment
Topic 4: government, president, congress, law, political
Each document is a mixture of topics:
Document #42: 60% Space + 25% Computer + 15% Other
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# 20 Newsgroups sample documents
documents = [
    "The hockey team scored three goals in the game",
    "NASA launched a new satellite into orbit",
    "Install the software program on your computer",
    "The doctor prescribed medicine for the patient",
    # ... more documents
]

# Step 1: Create bag-of-words matrix
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
bow_matrix = vectorizer.fit_transform(documents)

# Step 2: Fit LDA
lda = LatentDirichletAllocation(n_components=5, random_state=42)
doc_topics = lda.fit_transform(bow_matrix)

# Step 3: Print topics
vocab = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    # argsort is ascending, so take the last 7 and reverse them to list the
    # highest-weighted word first.
    top_words = [vocab[i] for i in topic.argsort()[-7:][::-1]]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")
Learning embeddings from context:
Skip-gram:
Given target word, predict context
"The cat sat on mat"
CBOW:
Given context, predict target
the, sat, on → cat
from gensim.models import Word2Vec

model = Word2Vec(
    sentences=tokenized_docs,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1  # Skip-gram
)
# Find similar words
model.wv.most_similar('computer', topn=5)
# [('software', 0.82), ('program', 0.79), ('system', 0.75), ...]

# Word analogies: king - man + woman
model.wv.most_similar(
    positive=['woman', 'king'],
    negative=['man']
)
# [('queen', 0.71), ...]
Try creating your own word analogies! What works? What fails?
# Analogies that typically WORK well:
model.wv.most_similar(positive=['paris', 'germany'], negative=['france'])
# → 'berlin' (capital cities)

model.wv.most_similar(positive=['walking', 'swam'], negative=['swimming'])
# → 'walked' (verb tenses)

model.wv.most_similar(positive=['bigger', 'cold'], negative=['big'])
# → 'colder' (comparatives)

# Analogies that often FAIL:
model.wv.most_similar(positive=['doctor', 'woman'], negative=['man'])
# → might return 'nurse' instead of 'doctor' (reflects bias!)

model.wv.most_similar(positive=['sushi', 'italy'], negative=['japan'])
# → uncertain results (cultural associations are noisy)
Discussion: Why do some analogies work better than others?
Projecting 100D → 2D:
import umap

reducer = umap.UMAP(
    n_neighbors=15,
    min_dist=0.1,
    metric='cosine'
)

# Project the word vectors down to two dimensions for plotting
embeddings_2d = reducer.fit_transform(word_vectors)
UMAP advantages:
When you visualize embeddings:
Sports cluster:
Space cluster:
Tech cluster:
Medical cluster:
Related words should cluster together even though we never told the model they were related!
import numpy as np
import umap
import matplotlib.pyplot as plt

# Get word vectors for a subset of interesting words
words_to_plot = ['hockey', 'baseball', 'player', 'team', 'game',
                 'nasa', 'shuttle', 'orbit', 'space', 'satellite',
                 'computer', 'software', 'program', 'windows', 'disk',
                 'doctor', 'patient', 'hospital', 'disease', 'treatment']

word_vectors = np.array([model.wv[w] for w in words_to_plot])

# Reduce to 2D with UMAP
reducer = umap.UMAP(n_neighbors=5, min_dist=0.3, metric='cosine')
embeddings_2d = reducer.fit_transform(word_vectors)

# Plot each word as a labelled point
plt.figure(figsize=(12, 8))
plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], alpha=0.7)
for i, word in enumerate(words_to_plot):
    plt.annotate(word, (embeddings_2d[i, 0], embeddings_2d[i, 1]))
plt.title("Word Embeddings Visualized with UMAP")
plt.savefig("word_clusters.png")
| Method | Speed | Interpretability | Quality | Data Needed |
|---|---|---|---|---|
| LSA | Fast | Medium | Medium | Small-Medium |
| LDA | Medium | High | Medium | Medium |
| Word2Vec | Medium | Low | High | Large |
Recommendations:
Using embeddings as features:
import numpy as np
from sklearn.linear_model import LogisticRegression

def document_vector(doc, model):
    """Average word vectors for document.

    Tokens missing from the model's vocabulary are skipped. If no token is
    known, a zero vector with the model's own dimensionality is returned so
    that every document maps to a vector of the same length.
    """
    tokens = preprocess(doc)
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.wv.vector_size)

# Train classifier
X_train = [document_vector(doc, model) for doc in train_docs]
clf = LogisticRegression()
clf.fit(X_train, y_train)
Compare to TF-IDF baseline!
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# NOTE: the TF-IDF and LSA features below are fit on all of train_docs before
# cross-validation, so each fold's features have seen its held-out documents.
# For a leak-free estimate, wrap the vectorizer and classifier in a Pipeline.

# Method 1: TF-IDF baseline
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(train_docs)
clf_tfidf = LogisticRegression(max_iter=1000)
tfidf_scores = cross_val_score(clf_tfidf, X_tfidf, y_train, cv=5)

# Method 2: LSA embeddings (seeded so the scores are reproducible)
lsa = TruncatedSVD(n_components=100, random_state=42)
X_lsa = lsa.fit_transform(X_tfidf)
clf_lsa = LogisticRegression(max_iter=1000)
lsa_scores = cross_val_score(clf_lsa, X_lsa, y_train, cv=5)

# Method 3: Word2Vec embeddings
X_w2v = np.array([document_vector(doc, model) for doc in train_docs])
clf_w2v = LogisticRegression(max_iter=1000)
w2v_scores = cross_val_score(clf_w2v, X_w2v, y_train, cv=5)

print(f"TF-IDF: {tfidf_scores.mean():.3f} (+/- {tfidf_scores.std():.3f})")
print(f"LSA: {lsa_scores.mean():.3f} (+/- {lsa_scores.std():.3f})")
print(f"Word2Vec: {w2v_scores.mean():.3f} (+/- {w2v_scores.std():.3f})")
Embeddings capture semantic meaning - similar words have similar vectors
Different methods, different strengths:
Visualization reveals structure - UMAP shows semantic clusters
Limitations:
Next week: Contextual embeddings (BERT, GPT)!
Why does vector arithmetic work? What does "king - man + woman" really mean geometrically?
Bias in embeddings: If Word2Vec learns from news articles, what biases might it capture?
Window size matters: What happens with window=2 vs window=10?
Out-of-vocabulary problem: How do you handle words not in your vocabulary?
When to use what: For a sentiment analysis task, would you choose LSA, LDA, or Word2Vec?
For Assignment 2:
Coming up in Lecture 11:
Office hours: Available if you need help with the notebook!