* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
Goal: Represent each word as a point in high-dimensional space
Consider a tiny corpus with 3 documents:
Word-context co-occurrence:
| Word | appears with "the" | appears with "sat" | appears with "ran" |
|---|---|---|---|
| cat | 2 | 1 | 0 |
| dog | 2 | 0 | 1 |
| mat | 1 | 1 | 0 |
Problem: Real vocabularies have 10,000+ words, contexts are even more numerous!
Step 1: Count how often each word appears in each document
| Word | Doc 1 | Doc 2 | Doc 3 | Doc 4 |
|---|---|---|---|---|
| cat | 0 | 4 | 1 | 0 |
| computer | 0 | 0 | 0 | 5 |
| code | 0 | 0 | 0 | 3 |
| animal | 2 | 3 | 2 | 0 |
Issues:
\pause
Apply TF-IDF weighting and dimensionality reduction!
Term Frequency - Inverse Document Frequency
Term Frequency (TF):
How often does term $t$ appear in document $d$?
Rewards frequent terms in document
Inverse Document Frequency (IDF):
How rare is term $t$ across all documents?
Penalizes common words like "the", "and"
Corpus: 3 documents, 1000 total documents in collection
| Document | Text |
|---|---|
| Doc 1 | "machine learning is great" |
| Doc 2 | "deep learning models" |
| Doc 3 | "machine translation works" |
Calculate TF-IDF for "learning" in Doc 1:
Step 1: TF("learning", Doc1) = 1/4 = 0.25   (1 occurrence, 4 words)

Step 2: IDF("learning") = log10(1000/500) = log10(2) = 0.301
        (appears in 500 of 1000 docs)

Step 3: TF-IDF = 0.25 × 0.301 = 0.075
Compare: "the" (appears in 950 docs):
IDF("the") = log10(1000/950) = 0.022   ← Much lower! Common words are penalized
The OG of semantic embeddings (1990)
Algorithm:
Key Innovation:
SVD Decomposition:
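$$
\underbrace{X}_{m \times n} \;=\; \underbrace{U}_{m \times k}\;\underbrace{\Sigma}_{k \times k}\;\underbrace{V^{T}}_{k \times n}
$$

where $X$ is the words × docs matrix, $U$ maps words × topics, $\Sigma$ holds the topic strengths, and $V^{T}$ maps topics × docs.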
Matrix Interpretation:
Reference: Deerwester et al. (1990) - "Indexing by Latent Semantic Analysis"
Mini corpus (4 words × 3 documents):
         Doc1  Doc2  Doc3
cat        2     0     1
dog        0     3     1
pet        1     2     0
animal     1     1     1
Step 1: Apply SVD
Step 2: Keep top 2 dimensions (k=2)
from sklearn.decomposition import TruncatedSVD
import numpy as np

# Word-document count matrix: one row per word, one column per document
words = ["cat", "dog", "pet", "animal"]
X = np.array([[2,0,1], [0,3,1], [1,2,0], [1,1,1]])

# Keep the top 2 latent dimensions; fixing the seed makes the printed
# vectors reproducible across runs
svd = TruncatedSVD(n_components=2, random_state=42)
word_embeddings = svd.fit_transform(X)

print("Word vectors (2D):")
for word, (x, y) in zip(words, word_embeddings):
    print(f"{word}: [{x:.2f}, {y:.2f}]")
Result: "pet" and "animal" end up close together in 2D space!
Example: Discovering latent topics
Original Dimensions:
After LSA:
Discovered Topics (example):
computer (0.8), software (0.7), code (0.6), algorithm (0.5)
dog (0.9), cat (0.8), pet (0.7), animal (0.6)
game (0.8), team (0.7), player (0.6), win (0.5)
Words with similar topic distributions are semantically similar!
Computational Issues:
Modeling Issues:
Example Problem: Polysemy
LSA gives one vector for all meanings!
But...
A probabilistic approach to topic modeling
What if we model documents as probabilistic mixtures of topics?
Key Assumptions:
Advantages over LSA:
Topic 1 -> sports, game
Topic 2 -> tech, code
Topic 3 -> animal, pet
Reference: Blei et al. (2003) - "Latent Dirichlet Allocation"
How LDA imagines documents are created:
Generative Process:
Key Parameters:
Inference:
Imagine generating a document about "tech pets":
Step 1: Pick topic mixture for this document
        θ_doc = [0.6 Tech, 0.3 Animals, 0.1 Sports]

Step 2: For each word, sample a topic, then sample a word:

Word 1: Sample topic → Tech (60% chance)
        Sample word from Tech → "software"

Word 2: Sample topic → Animals (30% chance)
        Sample word from Animals → "cat"

Word 3: Sample topic → Tech (60% chance)
        Sample word from Tech → "computer"

Word 4: Sample topic → Animals (30% chance)
        Sample word from Animals → "pet"

Result: "software cat computer pet"
LDA reverses this: Given documents, infer the topics!
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Prepare documents
documents = [
    "The cat sat on the mat",
    "The dog ran in the park",
    "Machine learning is amazing",
    # ... more documents
]

# Create bag-of-words representation
vectorizer = CountVectorizer(max_features=1000, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(documents)
vocab = vectorizer.get_feature_names_out()

# Fit LDA model
lda = LatentDirichletAllocation(
    n_components=10,  # number of topics
    random_state=42,
    max_iter=50
)

# One row per document, one column per topic
doc_topics = lda.fit_transform(doc_term_matrix)

# Print top words per topic, highest-weighted word first
# (argsort is ascending, so reverse it before taking the first 10)
for idx, topic in enumerate(lda.components_):
    top_words = [vocab[i] for i in topic.argsort()[::-1][:10]]
    print(f"Topic {idx}: {', '.join(top_words)}")
Example Topics from News Corpus:
government, election, president, vote, policy, senate, congress, party
game, team, player, score, win, season, coach, championship
software, computer, data, algorithm, system, internet, code, AI
market, stock, price, investment, economy, trade, bank, profit
Document representation: [0.20, 0.15, 0.25, 0.40]
Combining neural embeddings with topic models
BERTopic Pipeline:
Advantages:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# NOTE: `documents` (a list of raw text strings) is expected to be defined
# before this snippet runs.

# Initialize
embedding_model = SentenceTransformer(
    "all-MiniLM-L6-v2"
)

model = BERTopic(
    embedding_model=embedding_model,
    language="english",
    calculate_probabilities=True,
    verbose=True
)

# Fit and transform
topics, probs = model.fit_transform(
    documents
)

# Get topic info
topic_info = model.get_topic_info()

# Visualize
fig = model.visualize_topics()
Reference: Grootendorst (2022) - "BERTopic: Neural topic modeling with a class-based TF-IDF procedure"
| Aspect | LSA | LDA | BERTopic |
|---|---|---|---|
| Interpretability | Medium | High | High |
| Topic coherence | Medium | High | Very High |
| Computation | Fast | Medium | Slow |
| Scalability | Good | Good | Medium |
| Pre-training | No | No | Yes |
| Hyperparameters | Few | Many | Many |
| Topic number | Fixed | Fixed | Automatic |
Intrinsic Metrics:
Extrinsic Metrics:
Human Evaluation:
Perplexity
Always check topic coherence and interpretability.
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary

# NOTE: `documents` (a list of raw text strings) is expected to be defined
# before this snippet runs.

# Prepare corpus
texts = [doc.split() for doc in documents]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train LDA with different topic numbers
coherence_scores = []
for num_topics in [5, 10, 15, 20, 25]:
    lda = LdaModel(corpus, num_topics=num_topics,
                   id2word=dictionary, passes=10)

    # Calculate coherence (C_V is recommended)
    coherence_model = CoherenceModel(model=lda, texts=texts,
                                     dictionary=dictionary,
                                     coherence='c_v')
    coherence_scores.append((num_topics, coherence_model.get_coherence()))

# Example results: [(5, 0.42), (10, 0.51), (15, 0.48), (20, 0.45), (25, 0.41)]
# Best: 10 topics with coherence 0.51
Rule of thumb: Higher coherence = more interpretable topics
Real-world examples:
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Documents already vectorized with TF-IDF
# tfidf_matrix shape: (1000 docs, 5000 words)
# NOTE: `tfidf_matrix` and the fitted `vectorizer` that produced it are
# expected to be defined before this snippet runs.

# Apply LSA to reduce dimensions
lsa = TruncatedSVD(n_components=100)
doc_embeddings = lsa.fit_transform(tfidf_matrix)

# User query: "machine learning algorithms"
# The query must go through the same vectorizer and the same fitted LSA
# model so it lands in the same space as the documents
query_tfidf = vectorizer.transform(["machine learning algorithms"])
query_embedding = lsa.transform(query_tfidf)

# Find most similar documents
similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
top_5_docs = similarities.argsort()[-5:][::-1]  # 5 highest scores, best first

print("Most relevant documents:")
for idx in top_5_docs:
    print(f"  Doc {idx}: similarity = {similarities[idx]:.3f}")
Key insight: LSA finds documents about "neural networks" and "deep learning" even though those exact words weren't in the query!
Can distributional semantics truly capture meaning?
Arguments For:
Arguments Against:
"How can the semantic interpretation of a formal symbol system be made intrinsic to the system, rather than just parasitic on the meanings in our heads?" - Stevan Harnad (1990)
Reference: Harnad, S. (1990). "The symbol grounding problem"
What we learned today:
Next: Neural word embeddings (Word2Vec, GloVe, FastText)!
Classic Papers:
Modern Extensions:
Tools & Libraries:
- `sklearn.decomposition.TruncatedSVD`, `LatentDirichletAllocation`
- `gensim.models.LsiModel`, `LdaModel`
- https://maartengr.github.io/BERTopic/

Next Lecture:
Word Embeddings: Word2Vec, GloVe, FastText
From count-based to prediction-based methods!