* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();

Lecture 5: Data Cleaning & Preprocessing

PSYC 51.07: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Understand why data cleaning is critical for NLP tasks
  2. Apply common preprocessing techniques to raw text
  3. Use web scraping to collect text data
  4. Implement lemmatization and stemming
  5. Make informed decisions about preprocessing strategies

Garbage in, garbage out!

Real-world text is messy

Common problems:

  • HTML/XML tags: <p>text</p>
  • Special characters: &nbsp; &amp;
  • Inconsistent formatting
  • Extra whitespace
  • Mixed encodings (UTF-8, ASCII...)
  • Emoji and Unicode characters

Consequences of dirty data:

  • Models learn noise, not signal
  • Reduced accuracy and consistency
  • Poor generalization to new data
  • Wasted computation on junk
  • Unpredictable model behavior
  • Difficulty debugging issues

"Quality of input = quality of output"

The data cleaning pipeline

Raw Text → Remove Tags → Normalize → Tokenize → Lemmatize → Clean Text
A typical data cleaning pipeline

Pipeline varies by task! Not all steps are always needed.

Cleaning messy text step by step

Raw input:


1<p>I LOVE this!!! 
2https://ex.com 
3a@b.com</p>

Steps:

  1. Remove HTML tags
  2. Remove URLs/emails
  3. Normalize whitespace

1import re
2raw = "<p>I LOVE this!!! https://ex.com a@b.com</p>"
3
4text = re.sub(r'<.*?>', '', raw)       # Step 1
5text = re.sub(r'http\S+', '', text)    # Step 2
6text = re.sub(r'\S+@\S+', '', text)
7text = ' '.join(text.split())          # Step 3
8
9print(text)  # "I LOVE this!!!"

Worked example: Complete pipeline

Full transformation:

Stage Text
Raw <p>I LOVE this product!!! https://ex.com</p>
Remove HTML I LOVE this product!!! https://ex.com
Remove URLs I LOVE this product!!!
Normalize spaces I LOVE this product!!!
Lowercase i love this product!!!
Remove extra punctuation i love this product!

Key decision points:

  • Keep emoji? Yes for sentiment analysis (conveys emotion)
  • Keep punctuation ("!!!")? Maybe (conveys emphasis, but noisy)
  • Lowercase? Depends on task (lose "LOVE" emphasis)

Different tasks need different preprocessing

Task Keep Remove
Sentiment analysis Emoji, punctuation (style matters) Extra whitespace, special chars
Named entity recognition Case (proper nouns important) Extra whitespace
Topic modeling Content words only Punctuation, case (lowercase all)

Preserve information relevant to your task!

The web is a massive source of text data

What you can collect:

  • News articles and blog posts
  • Product reviews and ratings
  • Social media and forum discussions
  • Domain-specific technical content

Popular tools:

  • Beautiful Soup: Parse HTML/XML
  • Requests: Fetch web pages
  • Scrapy: Full scraping framework
  • Selenium: JavaScript-heavy sites

Check robots.txt, respect rate limits, and honor terms of service and copyright.

Web scraping with Beautiful Soup

Basic example:


1from bs4 import BeautifulSoup
2import requests
3
4# Fetch webpage
5url = "https://example.com/article"
6response = requests.get(url)
7html_content = response.content
8
9# Parse HTML
10soup = BeautifulSoup(html_content, 'html.parser')
11
12# Extract text from specific elements
13title = soup.find('h1').get_text()
14article = soup.find('article').get_text()
15clean_text = ' '.join(article.split())  # Remove extra whitespace
16print(f"Title: {title}\nArticle: {clean_text[:200]}...")

Advanced Beautiful Soup techniques


1from bs4 import BeautifulSoup
2
3html = """<div class="article">
4 <h2>Breaking News</h2>
5 <p class="content">First paragraph.</p>
6 <p class="content">Second paragraph.</p>
7 <div class="ads">Advertisement</div>
8</div>"""
9
10soup = BeautifulSoup(html, 'html.parser')
11
12paragraphs = soup.find_all('p', class_='content')  # Find content paragraphs
13text = ' '.join([p.get_text() for p in paragraphs])
14
15for ad in soup.find_all('div', class_='ads'):      # Remove unwanted sections
16    ad.decompose()
17
18article_text = soup.get_text(separator=' ', strip=True)
19print(article_text)

Always use UTF-8 encoding

Common encoding issues:

  • UTF-8 vs. ASCII vs. Latin-1
  • Special characters: é, ñ, ü
  • Emoji require Unicode support
  • Mixed encodings in scraped data

Best practices:

  • Default to UTF-8 for all files
  • Detect unknown: chardet library
  • Normalize Unicode: NFKC form
  • Fix broken text: ftfy library

Web scraping often produces mixed encodings


1import chardet
2
3raw_bytes = b'Caf\xe9 au lait \x96 delicious!'  # Unknown encoding
4
5result = chardet.detect(raw_bytes)              # Detect encoding
6print(f"Detected: {result}")  # {'encoding': 'Windows-1252', 'confidence': 0.73}
7
8text = raw_bytes.decode(result['encoding'])     # Decode with detected encoding
9print(text)                   # "Café au lait – delicious!"
10
11import ftfy                                     # Alternative: fix broken Unicode
12broken = "CafÃ© au lait â€“ delicious!"
13fixed = ftfy.fix_text(broken)
14print(fixed)                  # "Café au lait – delicious!"

Pro tip: Save all files as UTF-8 to avoid these headaches!

Five common preprocessing steps

1. HTML/XML tag removal

  • <p>Hello</p> → Hello

2. Whitespace normalization

  • "Hello world\n" → "Hello world"

3. Punctuation handling

  • Keep for sentiment ("Great!" vs "Great")
  • Remove for topic modeling

4. Case normalization

  • Lowercase: "Apple" = "apple"
  • Preserve for NER: "Apple Inc."

5. Special character removal

  • URLs, emails, numbers
  • Task-dependent decisions

Preprocessing example


1import re
2
3def preprocess_text(text):
4    text = re.sub(r'http\S+|www.\S+', '', text)  # Remove URLs
5    text = re.sub(r'\S+@\S+', '', text)          # Remove emails
6    text = re.sub(r'<.*?>', '', text)            # Remove HTML tags
7    text = ' '.join(text.split())                # Remove extra whitespace
8    text = text.lower()                          # Lowercase
9    return text
10
11raw = "Check out https://example.com! <b>Amazing</b> deals!!"
12print(preprocess_text(raw))  # "check out amazing deals!!"

Stemming is fast but crude; lemmatization is accurate

Stemming

  • Rule-based crude chopping
  • Fast and simple
  • May produce non-words
  • Porter Stemmer (1980)
  • "running" → "run"

Lemmatization

  • Uses vocabulary + morphology
  • Slower but more accurate
  • Always produces real words
  • Requires POS context
  • "better" → "good"
  • Stemming: Speed matters, approximate matching OK
  • Lemmatization: Need interpretable, real words

Lemmatization produces real words; stemming produces fragments

Word Porter Stemmer Lemmatizer Notes
ran ran run Lemma recognizes irregular verb
runs run run Both work well
running run run Both work well
runner runner runner Both keep as-is (different word)
better better good Lemma handles irregular adjective
best best good Lemma handles superlative
geese gees goose Stemmer produces non-word!
studies studi study Stemmer produces non-word!
continued...

Lemmatization produces real words; stemming produces fragments

Word Porter Stemmer Lemmatizer Notes
organizing organ organize Stemmer too aggressive!
...continued

Key insight: Lemmatization produces real words; stemming can produce fragments.

Stemming can over-stem or under-stem


1from nltk.stem import PorterStemmer
2stemmer = PorterStemmer()
3
4# Over-stemming: Different words become the same
5words = ['universe', 'university', 'universal']
6stems = [stemmer.stem(w) for w in words]
7print(stems) # ['univers', 'univers', 'univers']
8# All three map to the same stem - meaning is lost!
9
10# Under-stemming: Same root stays different
11words2 = ['absorb', 'absorption']
12stems2 = [stemmer.stem(w) for w in words2]
13print(stems2) # ['absorb', 'absorpt']
14# Related words map to different stems!

Takeaway: Use lemmatization when you need interpretable, meaningful tokens.

Stemming with NLTK


1from nltk.stem import PorterStemmer, SnowballStemmer
2porter = PorterStemmer()
3snowball = SnowballStemmer('english')
4
5words = ['running', 'runs', 'runner', 'ran', 'easily', 'fairly']
6
7print("Word         Porter       Snowball")
8for word in words:
9    print(f"{word:12} {porter.stem(word):12} {snowball.stem(word)}")
10
11# Output:
12# running      run          run
13# runs         run          run
14# runner       runner       runner
15# ran          ran          ran
16# easily       easili       easili
17# fairly       fairli       fair

Lemmatization with spaCy


1import spacy
2nlp = spacy.load("en_core_web_sm")
3
4text = "The cats are running faster than dogs ran yesterday"
5doc = nlp(text)
6
7print(f"{'Word':<12} {'Lemma':<12} {'POS'}")
8print("-" * 36)
9for token in doc:
10    print(f"{token.text:<12} {token.lemma_:<12} {token.pos_}")
11
12# Output:
13# Word         Lemma        POS
14# cats         cat          NOUN
15# running      run          VERB
16# dogs         dog          NOUN
17# ran          run          VERB

spaCy vs. NLTK for lemmatization

Feature spaCy NLTK
Accuracy High Good
Setup Easy Requires data download
POS tagging Built-in Separate step
Dependencies Included Manual WordNet
Use case Production Research/Teaching

Use spaCy for most tasks! It's faster, more accurate, and easier to use in production.

Stop words: common words with little semantic value

When to remove:

  • Topic modeling
  • Text classification
  • Search engines
  • Information retrieval

When to keep:

  • Sentiment ("not good" ≠ "good")
  • Machine translation
  • Text generation
  • Neural models (they learn!)

"the", "a", "is", "in", "and", "or" — language-specific lists available in NLTK and spaCy.

Stop words in practice


1from nltk.corpus import stopwords
2import spacy
3
4stop_words_nltk = set(stopwords.words('english'))  # NLTK approach
5text = "This is an example showing stop word removal"
6filtered_nltk = [w for w in text.lower().split() if w not in stop_words_nltk]
7print("NLTK:", filtered_nltk)  # ['example', 'showing', 'stop', 'word', 'removal']
8
9nlp = spacy.load("en_core_web_sm")  # spaCy approach
10doc = nlp(text)
11filtered_spacy = [token.text for token in doc if not token.is_stop]
12print("spaCy:", filtered_spacy)  # ['example', 'showing', 'stop', 'word', 'removal']

Sentiment analysis requires preserving emotional signals

"I LOVE this product!!! Best purchase ever! See details at http://example.com"

Keep:

  • Punctuation (! conveys emphasis)
  • Capitalization (LOVE = strong)
  • Emoji if present

Remove:

  • URLs (no sentiment value)
  • HTML tags
  • Extra punctuation (!!! → !)

"I LOVE this product! Best purchase ever!" — emotion preserved!

Discussion: Preprocessing trade-offs

  1. Information loss: What do we lose when we lowercase everything?
    • Think: "US" (United States) vs. "us" (pronoun)
  2. Task dependency: Why does preprocessing differ by task?
    • Hint: What signals matter for sentiment vs. topic modeling?
  3. Modern models: Do transformer models still need heavy preprocessing?
    • Consider: BERT handles subwords, capitalization, punctuation...
  4. Bias introduction: Can preprocessing introduce bias?
    • Example: Removing slang might remove cultural markers

Practical tips for data cleaning

  1. Inspect your data first! — Look at samples before deciding on preprocessing
  2. Keep raw data separate — Never overwrite originals; you might need them!
  3. Document your pipeline — Track what preprocessing you applied and why
  4. Experiment! — Try different approaches, measure impact on task
  5. Validate incrementally — Check results after each preprocessing step
  6. Consider automation — Use libraries: spaCy, NLTK, HuggingFace tokenizers

Complete preprocessing pipeline example


1import spacy, re
2
3class TextPreprocessor:
4    def __init__(self, remove_stopwords=False):
5        self.nlp = spacy.load("en_core_web_sm")
6        self.remove_stopwords = remove_stopwords
7
8    def clean(self, text):
9        text = re.sub(r'http\S+', '', text)   # Remove URLs
10        text = re.sub(r'<.*?>', '', text)     # Remove HTML
11        text = ' '.join(text.split())         # Normalize whitespace
12        doc = self.nlp(text)
13        tokens = [t.lemma_ for t in doc if not (self.remove_stopwords and t.is_stop)]
14        return ' '.join(tokens)

Preprocessing checklist

  • What is my task? (classification, generation, extraction...)
  • What signals are important? (sentiment, topics, entities...)
  • Should I lowercase? (preserve case for names?)
  • How to handle punctuation? (keep for emotion?)
  • Do I need lemmatization? (or will model handle it?)
  • Should I remove stop words? (or keep for grammar?)
  • How to handle special characters? (emoji, numbers, symbols)
  • What about rare/unknown words? (keep, remove, replace?)
  • Have I validated on sample data? (inspect before/after!)

There's no one-size-fits-all solution!

Tools and libraries

Web Scraping:

Text Processing:

Encoding:

  • chardet: Character encoding detection
  • ftfy: Fixes broken Unicode

HuggingFace Resources:

Primary references

  • Porter, M. F. (1980). An algorithm for suffix stripping. Program, 14(3), 130-137. — The original Porter Stemmer algorithm
  • Manning, C. D., Raghavan, P., & Schütze, H. (2008). Introduction to Information Retrieval. Cambridge University Press. — Chapter 2: Text preprocessing fundamentals

Modern resources:

  • spaCy documentation: Industrial-strength NLP
  • HuggingFace NLP Course: Modern preprocessing with transformers

Ethical considerations:

  • Liang et al. (2020). "Towards Debiasing Sentence Representations"
  • Consider how preprocessing choices affect fairness

Hands-on exercise

  1. Choose a website (news, blog, Reddit...)
  2. Scrape 10-20 articles/posts
  3. Apply different preprocessing pipelines:
    • Minimal: Just remove HTML
    • Moderate: + normalize whitespace, lowercase
    • Heavy: + lemmatize, remove stop words
  4. Compare the results:
    • How does vocabulary size change?
    • What information is lost/preserved?
    • Which would work best for sentiment analysis? Topic modeling?

Share interesting findings with classmates!

Key takeaways

  1. Preprocessing is crucial but task-dependent — No universal pipeline; adapt to your needs!
  2. Balance cleaning vs. information loss — More preprocessing does not always mean better
  3. Stemming vs. Lemmatization — Stemming: fast, approximate; Lemmatization: slow, accurate
  4. Modern models are robust — Transformers can handle messy text better than older models
  5. Always validate — Inspect data before and after preprocessing

Tokenization deep dive!

Questions? Want to chat more?

📧 Email me
💬 Join our Discord
💁 Come to office hours

Lecture 6 — Tokenization deep dive!