* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
Raw input:
1<p>I LOVE this!!!
2https://ex.com
3a@b.com</p>
Steps:
1import re
2raw = "<p>I LOVE this!!! https://ex.com a@b.com</p>"
3
4text = re.sub(r'<.*?>', '', raw) # Step 1
5text = re.sub(r'http\S+', '', text) # Step 2
6text = re.sub(r'\S+@\S+', '', text)
7text = ' '.join(text.split()) # Step 3
8
9print(text) # "I LOVE this!!!"
Full transformation:
| Stage | Text |
|---|---|
| Raw | <p>I LOVE this product!!! https://ex.com</p> |
| Remove HTML | I LOVE this product!!! https://ex.com |
| Remove URLs | I LOVE this product!!! |
| Normalize spaces | I LOVE this product!!! |
| Lowercase | i love this product!!! |
| Remove extra punctuation | i love this product! |
Key decision points:
| Task | Keep | Remove |
|---|---|---|
| Sentiment analysis | Emoji, punctuation (style matters) | Extra whitespace, special chars |
| Named entity recognition | Case (proper nouns important) | Extra whitespace |
| Topic modeling | Content words only | Punctuation, case (lowercase all) |
Preserve information relevant to your task!
What you can collect:
Popular tools:
Check robots.txt, respect rate limits, and honor terms of service and copyright.
Basic example:
1from bs4 import BeautifulSoup
2import requests
3
4# Fetch webpage
5url = "https://example.com/article"
6response = requests.get(url)
7html_content = response.content
8
9# Parse HTML
10soup = BeautifulSoup(html_content, 'html.parser')
11
12# Extract text from specific elements
13title = soup.find('h1').get_text()
14article = soup.find('article').get_text()
15clean_text = ' '.join(article.split()) # Remove extra whitespace
16print(f"Title: {title}\nArticle: {clean_text[:200]}...")
1from bs4 import BeautifulSoup
2
3html = """<div class="article">
4 <h2>Breaking News</h2>
5 <p class="content">First paragraph.</p>
6 <p class="content">Second paragraph.</p>
7 <div class="ads">Advertisement</div>
8</div>"""
9
10soup = BeautifulSoup(html, 'html.parser')
11
12paragraphs = soup.find_all('p', class_='content') # Find content paragraphs
13text = ' '.join([p.get_text() for p in paragraphs])
14
15for ad in soup.find_all('div', class_='ads'): # Remove unwanted sections
16 ad.decompose()
17
18article_text = soup.get_text(separator=' ', strip=True)
19print(article_text)
Common encoding issues:
Best practices:
chardet library · ftfy library
1import chardet
2
3raw_bytes = b'Caf\xe9 au lait \x96 delicious!' # Unknown encoding
4
5result = chardet.detect(raw_bytes) # Detect encoding
6print(f"Detected: {result}") # {'encoding': 'Windows-1252', 'confidence': 0.73}
7
8text = raw_bytes.decode(result['encoding']) # Decode with detected encoding
9print(text) # "Café au lait – delicious!"
10
11import ftfy # Alternative: fix broken Unicode
12broken = "CafÃ© au lait â€“ delicious!"
13fixed = ftfy.fix_text(broken)
14print(fixed) # "Café au lait – delicious!"
Pro tip: Save all files as UTF-8 to avoid these headaches!
1. HTML/XML tag removal
<p>Hello</p> → Hello
2. Whitespace normalization
"Hello    world\n" → "Hello world"
3. Punctuation handling
4. Case normalization
5. Special character removal
1import re
2
3def preprocess_text(text):
4 text = re.sub(r'http\S+|www.\S+', '', text) # Remove URLs
5 text = re.sub(r'\S+@\S+', '', text) # Remove emails
6 text = re.sub(r'<.*?>', '', text) # Remove HTML tags
7 text = ' '.join(text.split()) # Remove extra whitespace
8 text = text.lower() # Lowercase
9 return text
10
11raw = "Check out https://example.com! <b>Amazing</b> deals!!"
12print(preprocess_text(raw)) # "check out amazing deals!!"
Stemming
Lemmatization
| Word | Porter Stemmer | Lemmatizer | Notes |
|---|---|---|---|
| ran | ran | run | Lemma recognizes irregular verb |
| runs | run | run | Both work well |
| running | run | run | Both work well |
| runner | runner | runner | Both keep as-is (different word) |
| better | better | good | Lemma handles irregular adjective |
| best | best | good | Lemma handles superlative |
| geese | gees | goose | Stemmer produces non-word! |
| studies | studi | study | Stemmer produces non-word! |
| Word | Porter Stemmer | Lemmatizer | Notes |
|---|---|---|---|
| organizing | organ | organize | Stemmer too aggressive! |
Key insight: Lemmatization produces real words; stemming can produce fragments.
1from nltk.stem import PorterStemmer
2stemmer = PorterStemmer()
3
4# Over-stemming: Different words become the same
5words = ['universe', 'university', 'universal']
6stems = [stemmer.stem(w) for w in words]
7print(stems) # ['univers', 'univers', 'univers']
8# All three map to the same stem - meaning is lost!
9
10# Under-stemming: Same root stays different
11words2 = ['absorb', 'absorption']
12stems2 = [stemmer.stem(w) for w in words2]
13print(stems2) # ['absorb', 'absorpt']
14# Related words map to different stems!
Takeaway: Use lemmatization when you need interpretable, meaningful tokens.
1from nltk.stem import PorterStemmer, SnowballStemmer
2porter = PorterStemmer()
3snowball = SnowballStemmer('english')
4
5words = ['running', 'runs', 'runner', 'ran', 'easily', 'fairly']
6
7print("Word Porter Snowball")
8for word in words:
9 print(f"{word:12} {porter.stem(word):12} {snowball.stem(word)}")
10
11# Output:
12# running run run
13# runs run run
14# runner runner runner
15# ran ran ran
16# easily easili easili
17# fairly fairli fair
1import spacy
2nlp = spacy.load("en_core_web_sm")
3
4text = "The cats are running faster than dogs ran yesterday"
5doc = nlp(text)
6
7print(f"{'Word':<12} {'Lemma':<12} {'POS'}")
8print("-" * 36)
9for token in doc:
10 print(f"{token.text:<12} {token.lemma_:<12} {token.pos_}")
11
12# Output:
13# Word Lemma POS
14# cats cat NOUN
15# running run VERB
16# dogs dog NOUN
17# ran run VERB
| Feature | spaCy | NLTK |
|---|---|---|
| Accuracy | High | Good |
| Setup | Easy | Requires data download |
| POS tagging | Built-in | Separate step |
| Dependencies | Included | Manual WordNet |
| Use case | Production | Research/Teaching |
Use spaCy for most tasks! It's faster, more accurate, and easier to use in production.
When to remove:
When to keep:
"the", "a", "is", "in", "and", "or" — language-specific lists available in NLTK and spaCy.
1from nltk.corpus import stopwords
2import spacy
3
4stop_words_nltk = set(stopwords.words('english')) # NLTK approach
5text = "This is an example showing stop word removal"
6filtered_nltk = [w for w in text.lower().split() if w not in stop_words_nltk]
7print("NLTK:", filtered_nltk) # ['example', 'showing', 'stop', 'word', 'removal']
8
9nlp = spacy.load("en_core_web_sm") # spaCy approach
10doc = nlp(text)
11filtered_spacy = [token.text for token in doc if not token.is_stop]
12print("spaCy:", filtered_spacy) # ['example', 'showing', 'stop', 'word', 'removal']
"I LOVE this product!!! Best purchase ever! See details at http://example.com"
Keep:
Remove:
"I LOVE this product! Best purchase ever!" — emotion preserved!
1import spacy, re
2
3class TextPreprocessor:
4 def __init__(self, remove_stopwords=False):
5 self.nlp = spacy.load("en_core_web_sm")
6 self.remove_stopwords = remove_stopwords
7
8 def clean(self, text):
9 text = re.sub(r'http\S+', '', text) # Remove URLs
10 text = re.sub(r'<.*?>', '', text) # Remove HTML
11 text = ' '.join(text.split()) # Normalize whitespace
12 doc = self.nlp(text)
13 tokens = [t.lemma_ for t in doc if not (self.remove_stopwords and t.is_stop)]
14 return ' '.join(tokens)
There's no one-size-fits-all solution!
Encoding:
HuggingFace Resources:
Modern resources:
Ethical considerations:
Share interesting findings with classmates!
Tokenization deep dive!
Lecture 6 — Tokenization deep dive!