
Lecture 6: Tokenization

PSYC 51.17: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Understand what tokenization is and why it matters
  2. Explain the limitations of word-level tokenization
  3. Describe how subword tokenization works (BPE, WordPiece, SentencePiece)
  4. Compare different tokenization methods and their trade-offs
  5. Implement and experiment with different tokenizers using HuggingFace

How do we break text into meaningful units (for analyzing text, training AI models, etc.)?

What is tokenization?

Tokenization is the process of converting text into smaller units (tokens).

The granularity of tokenization determines what a model can learn

Why tokenize?

  • Neural networks process numbers, not text
  • Need discrete units to create vocabulary
  • Represent text efficiently (compression!)
  • Enable learning patterns at different levels
  • First step in (nearly) any NLP pipeline

What can tokens be?

  • Bytes: 0-255 (most fine-grained)
  • Characters: a, b, c, ... (very fine-grained)
  • Subwords: "un", "happiness" (sweet spot!)
  • Words: "hello", "world" (intuitive but limited)
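
A quick illustration of the first, second, and fourth granularities in plain Python (subwords require a learned vocabulary, covered below):

text = "unhappiness"
print(list(text.encode("utf-8")))  # bytes: [117, 110, 104, 97, 112, 112, 105, 110, 101, 115, 115]
print(list(text))                  # characters: ['u', 'n', 'h', 'a', 'p', 'p', 'i', 'n', 'e', 's', 's']
print("the cat sat".split())       # words: ['the', 'cat', 'sat']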

The tokenization spectrum

[Spectrum: Character → Subword → Word → Multi-word, from fine-grained to coarse-grained tokenization]

Granularity | Vocabulary size | Sequence length
Character | ~100s | Very long
Subword | 30k-50k | Medium
Word | 100k+ | Short
Multi-word | Millions | Very short (rarely used in practice)
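
To see the granularity/length trade-off concretely: the character and word counts below are exact, while the subword count depends on the tokenizer (a few dozen characters usually become a handful of subwords):

text = "Tokenization determines sequence length."
print(len(list(text)))    # 40 tokens at the character level
print(len(text.split()))  # 4 tokens at the word level
# A subword tokenizer would land in between (roughly 6-8 tokens here)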

Splitting on spaces fails for three reasons

  1. Vocabulary explosion: English has ~170,000 words in common use, each inflection counts as a separate word ("run", "runs", "running", "ran"), and compounds vary ("ice cream", "icecream", "ice-cream")

  2. Out-of-vocabulary (OOV) words: new terms ("COVID-19", "selfie"), rare words ("supercalifragilisticexpialidocious"), and typos ("teh") may be missing from the training data

  3. Languages without spaces: Chinese: 没有空格 ("no spaces") | Japanese: 日本語も同様 ("same for Japanese") | Thai: เหมือนกัน ("same here") | Emojis: 😊🚀

Word-level limitations illustrated

[Chart: vocabulary growth as training data increases, comparing word-level, subword-level, and character-level tokenization]

Word-level vocabulary keeps growing! Subword vocabulary stabilizes at a reasonable size.

The OOV problem in action


import re

# Simple word-level vocabulary
vocab = {"hello", "world", "the", "cat", "sat"}

def tokenize_word_level(text, vocab):
    # Lowercase and strip punctuation before matching against the vocabulary
    tokens = re.findall(r"[a-z]+", text.lower())
    result = []
    for token in tokens:
        if token in vocab:
            result.append(token)
        else:
            result.append("<UNK>")  # Unknown token
    return result

# Example
text = "Hello! The cat jumped"
tokens = tokenize_word_level(text, vocab)
print(tokens)
# Output: ['hello', 'the', 'cat', '<UNK>']
# ^^^ "jumped" is unknown!

Subword tokenization to the rescue: most words are made of smaller meaningful pieces

Examples:

  • "unhappiness" = "un" + "happiness"
  • "preprocessing" = "pre" + "process" + "ing"
  • "antiestablishment" = "anti" + "establish" + "ment"
  • "running" = "run" + "ning"
  • "cats" = "cat" + "s"

Benefits:

  • Fixed vocabulary (30k-50k tokens)
  • No OOV: break into known parts (at the character level if needed)
  • Captures morphology (prefixes, suffixes)
  • Language-agnostic (works for many languages)

Learn common character sequences from data! Works for any language.

Subword tokenization solves the OOV problem


# Word-level vocabulary (only words seen in training)
vocab = {"the", "cat", "sat", "on", "mat", "dog", "ran"}

# Trying to tokenize a new sentence:
sentence = "The supercalifragilisticexpialidocious cat meowed"
# Word-level result loses almost everything: ["the", "<UNK>", "cat", "<UNK>"]

# The subword solution
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Same difficult word:
tokens = tokenizer.tokenize("supercalifragilisticexpialidocious")
print(tokens)
# ['super', 'cal', 'if', 'rag', 'il', 'istic', 'exp', 'ial', 'id', 'ocious']

# Every word can be tokenized! No information loss.
# Model can learn that 'super-' often means "very/above"

Visual: How "unhappiness" gets tokenized

[Diagram: "unhappiness" tokenized at the word level (1 token: "unhappiness"; may be OOV) vs. with subword BPE (3 tokens: "un" + "happi" + "ness"; always known)]

More tokens = longer sequences, but smaller vocabulary and no OOV!

Byte-Pair Encoding (BPE)

Sennrich, Haddow, & Birch (2016, ACL): Neural machine translation of rare words with subword units

  • Original use: Data compression (Gage, 1994)
  • Adapted for NLP: Sennrich et al. (2016)
  • Used by: GPT, GPT-2, GPT-3, RoBERTa, BART, many others!

Iteratively merge the most frequent pair of characters/subwords.

  1. Start with character-level vocabulary
  2. Count all adjacent pairs in corpus
  3. Merge most frequent pair → create new token
  4. Repeat until desired vocabulary size

BPE example: Step by step

Training data: "low low low lower lower newest newest newest newest widest"

Initial tokens (characters): l, o, w, e, r, n, s, t, i, d

Most frequent pair = w, e (appears 6 times: 2× in "lower" + 4× in "newest")

  • Merge → new token: we
  • Vocabulary: l, o, w, e, r, n, s, t, i, d, we

Most frequent is now a tie: l, o and s, t both appear 5 times; suppose we merge l, o first

  • Merge → new token: lo
  • Vocabulary: l, o, w, e, r, n, s, t, i, d, we, lo

Most frequent = s, t (appears 5 times: 4× in "newest" + 1× in "widest")

  • Merge → new token: st
  • Vocabulary: l, o, w, e, r, n, s, t, i, d, we, lo, st

...continue until we hit the target vocabulary size (e.g., 30,000) or run out of merges. (GPT-2-style BPE also treats the space as just another symbol, so space-prefixed pairs like ␣, n can be merged into tokens of their own.)
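
The walkthrough above can be reproduced with a minimal BPE trainer. This is a sketch of the standard word-frequency formulation (spaces ignored; ties broken by whichever pair was counted first), not production code:

from collections import Counter

def train_bpe(word_counts, num_merges):
    # 1. Represent each word as a tuple of single-character symbols
    corpus = {tuple(word): count for word, count in word_counts.items()}
    merges = []
    for _ in range(num_merges):
        # 2. Count all adjacent symbol pairs, weighted by word frequency
        pairs = Counter()
        for symbols, count in corpus.items():
            for pair in zip(symbols, symbols[1:]):
                pairs[pair] += count
        if not pairs:
            break  # ran out of merges
        # 3. Merge the most frequent pair into a new token
        best = max(pairs, key=pairs.get)
        merges.append(best)
        new_corpus = {}
        for symbols, count in corpus.items():
            merged, i = [], 0
            while i < len(symbols):
                if i + 1 < len(symbols) and (symbols[i], symbols[i + 1]) == best:
                    merged.append(symbols[i] + symbols[i + 1])
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            new_corpus[tuple(merged)] = count
        corpus = new_corpus
    # 4. (Repeat until the desired number of merges)
    return merges

word_counts = {"low": 3, "lower": 2, "newest": 4, "widest": 1}
print(train_bpe(word_counts, 3))
# [('w', 'e'), ('l', 'o'), ('s', 't')]  -> new tokens "we", "lo", "st"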

BPE in practice with HuggingFace


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 uses BPE

text = "I'm learning about tokenization!"
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# ['I', "'m", 'Ġlearning', 'Ġabout', 'Ġtoken', 'ization', '!']
# Note: 'Ġ' represents space

token_ids = tokenizer.encode(text)  # Get token IDs
print("Token IDs:", token_ids)  # [40, 1101, 4673, 546, 11241, 1634, 0]

decoded = tokenizer.decode(token_ids)  # Decode back
print("Decoded:", decoded)  # "I'm learning about tokenization!"

Use the Tokenization Explorer Demo to experiment with different tokenizers interactively.

WordPiece merges "surprisingly common" pairs

Wu et al. (2016, arXiv): Google's neural machine translation system

Instead of merging the most frequent pair, WordPiece merges the pair that maximizes the likelihood-based score:

score(x, y) = P(xy) / (P(x) · P(y))

  • Used by: BERT, DistilBERT, Electra
  • Special tokens: ## prefix for continuation

"playing" → ["play", "##ing"]

WordPiece example with BERT


from transformers import AutoTokenizer

# BERT uses WordPiece
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "I'm learning about tokenization!"

tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Output: ['i', "'", 'm', 'learning', 'about', 'token', '##ization', '!']
# ^^^ Note the ##. Also: BERT lowercases by default (unless using a cased model)

# Compare with a longer word
text2 = "unhappiness"
tokens2 = tokenizer.tokenize(text2)
print("Tokens:", tokens2)
# Output: ['un', '##hap', '##pin', '##ess']
# Breaks into morphological components!

SentencePiece works for languages without spaces

Kudo & Richardson (2018, EMNLP): SentencePiece: a simple and language independent subword tokenizer

Provilkov, Emelianenko, & Voita (2019, arXiv): BPE-Dropout: simple and effective subword regularization

Traditional tokenizers:

  • Assume pre-tokenized text (spaces between words)
  • Fail on Chinese, Japanese, Thai...

SentencePiece:

  • Takes a raw character stream as input
  • Spaces are just "another character" (▁)

Used by: T5, ALBERT, XLNet, mT5 (multilingual models)

SentencePiece in action


from transformers import AutoTokenizer

# T5 uses SentencePiece
tokenizer = AutoTokenizer.from_pretrained("t5-small")

text = "I'm learning about tokenization!"

tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Output: ['▁I', "'", 'm', '▁learning', '▁about', '▁token', 'ization', '!']
# ^^^ Note the '▁' marking spaces

# Multilingual SentencePiece models work seamlessly with other languages!
tokenizer_mt5 = AutoTokenizer.from_pretrained("google/mt5-small")
text_chinese = "这是中文"  # "This is Chinese"
tokens_cn = tokenizer_mt5.tokenize(text_chinese)
print("Chinese:", tokens_cn)
# Splits into characters/subwords without needing pre-tokenization

Tokenization methods comparison

Method | Approach | Used by | Key feature
BPE | Frequency-based merging | GPT-2, RoBERTa | Bottom-up
WordPiece | Likelihood-based merging | BERT, DistilBERT | Principled scoring
SentencePiece (Unigram) | Top-down pruning | T5, mT5 | Multilingual
SentencePiece (BPE) | Bottom-up merging | mBART | Works with raw text

Use the Tokenization Explorer Demo to compare tokenizers interactively:

  • Explore how different tokenizers split the same text
  • Examine how different methods handle different types of text (e.g., how efficiently they tokenize different passages)
  • Look at each tokenizer's vocabulary and special tokens

Comparing tokenizers side-by-side (in Python)


from transformers import AutoTokenizer

text = "The unhappiest researchers couldn't preprocess data!"

models = {
    "GPT-2 (BPE)": "gpt2",
    "BERT (WordPiece)": "bert-base-uncased",
    "T5 (SentencePiece)": "t5-small",
}

for name, model_name in models.items():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokens = tokenizer.tokenize(text)
    print(f"{name}: Tokens ({len(tokens)}): {tokens}")

# Observe: different handling of "unhappiest", spaces, and token counts!

Tokenizers add special tokens for model-specific purposes

Token | Purpose | Used by
[CLS] | Classification token (start) | BERT
[SEP] | Separator between segments | BERT
[PAD] | Padding to the same length | Most models
[UNK] | Unknown/rare tokens | Most models
[MASK] | Masked token for training | BERT
<s>, </s> | Start/end of sequence | RoBERTa, T5
<|endoftext|> | Document boundary | GPT-2

Example (BERT sentence pair): [CLS] The cat sat [SEP] The dog ran [SEP]

Working with special tokens


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Hello world"
encoded = tokenizer(text, return_tensors="pt")
print("Input IDs:", encoded['input_ids'])  # [101, 7592, 2088, 102]
# ^^^ [CLS] and [SEP] added automatically!

full_decode = tokenizer.decode(encoded['input_ids'][0])
print("With special:", full_decode)  # "[CLS] hello world [SEP]"

clean_decode = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
print("Without special:", clean_decode)  # "hello world"

Tokenization pitfalls and gotchas

  1. Tokenizer-model mismatch: Always use the tokenizer that matches your model! GPT-2 tokenizer ≠ BERT tokenizer
  2. Maximum sequence length: BERT: 512 tokens | GPT-2: 1024 | GPT-3: 2048 — text gets truncated if too long (see the sketch after this list)!
  3. Case sensitivity: bert-base-uncased lowercases everything; bert-base-cased preserves case
  4. Rare words → many tokens: "antidisestablishmentarianism" → 10+ tokens; can hit sequence limit faster than expected!
  5. Special characters: Emoji, Unicode, accents may be split unexpectedly
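
A quick check of pitfall #2, using HuggingFace (bert-base-uncased assumed; a sketch, not production code):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer.model_max_length)  # 512

long_text = "word " * 1000  # ~1000 word-level tokens
encoded = tokenizer(long_text, truncation=True, max_length=512)
print(len(encoded["input_ids"]))  # 512 -- everything past the limit is silently dropped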

Connection to human language learning

Saffran, Aslin, & Newport (1996, Science): Statistical learning by 8-month-old infants

Infant learning:

  • Babies track statistical regularities in speech
  • Identify word boundaries from transitional probabilities (see the sketch below)
  • No built-in rules, just patterns learned through experience!
  • Online, incremental, multimodal

BPE subword tokenization:

  • Merge frequent character pairs, discover subwords
  • Tracks which sequences often co-occur in training data
  • No built-in rules, just patterns learned from data!
  • Offline batch processing, unimodal (text only)
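
To make the parallel concrete, here is a sketch of the transitional-probability computation on a hypothetical syllable stream in the style of Saffran et al. (1996) (the nonsense words and counts are illustrative):

from collections import Counter
import random

# Three nonsense "words" concatenated with no pauses between them
words = ["bi-da-ku", "pa-do-ti", "go-la-bu"]
random.seed(0)
stream = [syll for _ in range(200) for syll in random.choice(words).split("-")]

# Transitional probability P(next | current) for adjacent syllables
pair_counts = Counter(zip(stream, stream[1:]))
first_counts = Counter(stream[:-1])
tp = {pair: n / first_counts[pair[0]] for pair, n in pair_counts.items()}

print(tp[("bi", "da")])           # within a word: always 1.0
print(tp.get(("ku", "pa"), 0.0))  # across words: ~1/3 -- a likely word boundary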

Discussion questions

  1. Linguistics vs. Statistics: BPE discovers morphemes (un-, -ing, -ness) without linguistic rules. Is this "learning" morphology, or just pattern matching?

  2. Cross-lingual tokenization: should we use the same tokenizer for all languages? What are the trade-offs?

  3. Semantic preservation: does breaking "unhappy" into ["un", "happy"] preserve meaning? What about "butterfly"?

  4. Human vs. machine: humans don't consciously tokenize words. Why do machines need to?

  5. Future directions: will we move toward character-level or byte-level models that don't need tokenization?

Key takeaways

  1. Tokenization is fundamental: turns raw text into model-ready input
  2. Word-level has major limitations: OOV problem, vocabulary explosion, language-dependent
  3. Subword tokenization is the sweet spot: Balanced vocabulary size and sequence length
  4. Different methods, similar principles: BPE: frequency-based | WordPiece: likelihood | SentencePiece: language-agnostic
  5. Statistical learning connects humans and machines: both discover structure from distributional patterns
  6. Always match the tokenizer to the model! Critical for correct predictions

Looking ahead

How tokenization connects:

  • POS tagging: Token-level classification
  • Sentiment: Sequence-level classification
  • Both depend on good tokenization!

Questions? Want to chat more?

📧 Email me
💬 Join our Discord
💁 Come to office hours

Remember that the ELIZA assignment is due on Friday at 11:59 PM — reach out if you have any questions or need help!