
Lecture 6: Tokenization

PSYC 51.07: Models of language and communication

Jeremy R. Manning
Dartmouth College
Winter 2026

Learning objectives

  1. Understand what tokenization is and why it matters
  2. Explain the limitations of word-level tokenization
  3. Describe how subword tokenization works (BPE, WordPiece, SentencePiece)
  4. Compare different tokenization methods and their trade-offs
  5. Implement and experiment with different tokenizers using HuggingFace

How do we break text into meaningful units for AI models?

What is tokenization?

Tokenization is the process of converting text into smaller units (tokens).

Why tokenize?

  • Neural networks process numbers, not text
  • Need discrete units to create vocabulary
  • First step in any NLP pipeline

What can tokens be?

  • Characters: a, b, c, ... (very fine-grained)
  • Words: "hello", "world" (intuitive but limited)
  • Subwords: "un", "happiness" (sweet spot!)
  • Bytes: 0-255 (most fine-grained)

"The granularity of tokenization determines what a model can learn"

The tokenization spectrum

[Figure: the tokenization spectrum, from fine-grained (characters) through subwords to coarse-grained (words)]

Granularity | Vocabulary size | Sequence length
Character   | ~100s           | Very long
Subword     | 30k-50k         | Medium
Word        | 100k+           | Short

Trade-off: vocabulary size vs. sequence length

Splitting on spaces fails for three reasons

  1. Vocabulary explosion: English has ~170,000 words, every inflection counts as a separate entry ("run", "runs", "running", "ran"), and compounds are written inconsistently ("ice cream", "icecream", "ice-cream")
  2. Out-of-vocabulary (OOV) words: new terms ("COVID-19", "selfie"), rare words ("supercalifragilisticexpialidocious"), and typos ("teh") have no entry at all
  3. No spaces to split on: Chinese (没有空格, "no spaces") and Japanese (日本語も同様, "likewise for Japanese") are written without word-separating spaces
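Each failure is easy to see with plain whitespace splitting (the example strings are the ones listed above):

# 1. Inflections and compounds each become separate vocabulary entries
print("run runs running ran ice-cream icecream".split())
# ['run', 'runs', 'running', 'ran', 'ice-cream', 'icecream']

# 2. New terms, rare words, and typos have no vocabulary entry at all
print("COVID-19 selfie teh".split())
# ['COVID-19', 'selfie', 'teh']  -> all map to <UNK> with a fixed word vocabulary

# 3. Languages written without spaces do not split at all
print("没有空格".split())
# ['没有空格']  -> the whole sentence comes back as one "word"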

Word-level limitations illustrated

[Figure: vocabulary growth as training data increases, comparing word-level, subword-level, and character-level tokenization]

Word-level vocabulary keeps growing! Subword vocabulary stabilizes at a reasonable size.

The OOV problem in action


import re

# Simple word-level vocabulary
vocab = {"hello", "world", "the", "cat", "sat"}

def tokenize_word_level(text, vocab):
    # Lowercase and strip punctuation before looking words up
    tokens = re.findall(r"[a-z]+", text.lower())
    result = []
    for token in tokens:
        if token in vocab:
            result.append(token)
        else:
            result.append("<UNK>")  # Unknown token
    return result

# Example
text = "Hello! The cat jumped"
tokens = tokenize_word_level(text, vocab)
print(tokens)
# Output: ['hello', 'the', 'cat', '<UNK>']
#                                  ^^^ "jumped" is unknown!

We lose information! "jumped" becomes meaningless <UNK>

Most words are made of smaller meaningful pieces

Examples:

  • "unhappiness" = "un" + "happiness"
  • "preprocessing" = "pre" + "process" + "ing"
  • "antiestablishment" = "anti" + "establish" + "ment"

Benefits:

  • Fixed vocabulary (30k-50k tokens)
  • No OOV: break into known parts
  • Captures morphology (prefixes, suffixes)

Learn common character sequences from data! Works for any language.

Subword tokenization solves the OOV problem


# Word-level vocabulary (only the words seen in training)
vocab = {"the", "cat", "sat", "on", "mat", "dog", "ran"}

# Trying to tokenize a new sentence:
sentence = "The supercalifragilisticexpialidocious cat meowed"
# Word-level result (after lowercasing): ["the", "<UNK>", "cat", "<UNK>"]
# We lost almost everything!

The subword solution:


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Same difficult sentence:
tokens = tokenizer.tokenize("supercalifragilisticexpialidocious")
print(tokens)
# ['super', 'cal', 'if', 'rag', 'il', 'istic', 'exp', 'ial', 'id', 'ocious']

# Every word can be tokenized! No information loss.
# The model can learn that 'super-' often means "very/above"

Subwords preserve meaning even for novel words!

Visual: How "unhappiness" gets tokenized

[Figure: "unhappiness" as a single word-level token (1 token, may be OOV) vs. subword BPE tokens "un" + "happi" + "ness" (3 tokens, always known)]

More tokens = longer sequences, but smaller vocabulary and no OOV!

Byte-Pair Encoding (BPE)

Sennrich, Haddow, & Birch (2016, ACL): Neural Machine Translation of Rare Words with Subword Units

  • Original use: Data compression (1994)
  • Adapted for NLP: Sennrich et al. (2016)
  • Used by: GPT, GPT-2, GPT-3, RoBERTa, BART, many others!

Iteratively merge the most frequent pair of characters/subwords; a minimal code sketch follows the steps below.

  1. Start with character-level vocabulary
  2. Count all adjacent pairs in corpus
  3. Merge most frequent pair → create new token
  4. Repeat until desired vocabulary size
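A minimal sketch of this training loop on a toy word-frequency corpus (illustrative only; the word frequencies match the worked example on the next slide, and real implementations such as Sennrich et al.'s add an end-of-word marker and many optimizations):

from collections import Counter

def get_pair_counts(corpus):
    # Count adjacent symbol pairs, weighted by word frequency
    pairs = Counter()
    for symbols, freq in corpus:
        for a, b in zip(symbols, symbols[1:]):
            pairs[(a, b)] += freq
    return pairs

def merge_pair(corpus, pair):
    # Replace every occurrence of the pair with a single merged symbol
    merged = []
    for symbols, freq in corpus:
        out, i = [], 0
        while i < len(symbols):
            if i < len(symbols) - 1 and (symbols[i], symbols[i + 1]) == pair:
                out.append(symbols[i] + symbols[i + 1])
                i += 2
            else:
                out.append(symbols[i])
                i += 1
        merged.append((out, freq))
    return merged

# Toy corpus as (list of characters, frequency) pairs
corpus = [(list(w), f) for w, f in
          [("low", 5), ("lower", 2), ("newest", 6), ("widest", 3)]]

merges = []
for _ in range(10):                   # stop after a target number of merges
    pairs = get_pair_counts(corpus)
    best = max(pairs, key=pairs.get)  # most frequent adjacent pair
    merges.append(best)
    corpus = merge_pair(corpus, best)

print(merges[:4])  # [('e', 's'), ('es', 't'), ('l', 'o'), ('lo', 'w')]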

BPE example: Step by step

Training data (word frequencies): "low" ×5, "lower" ×2, "newest" ×6, "widest" ×3

Initial tokens (characters): l, o, w, e, r, n, s, t, i, d

Most frequent pair = e, s (appears 9 times: 6 in "newest" + 3 in "widest")

  • Merge → new token: es
  • Vocabulary: l, o, w, e, r, n, s, t, i, d, es

Most frequent pair = es, t (appears 9 times)

  • Merge → new token: est
  • Vocabulary: l, o, w, e, r, n, s, t, i, d, es, est

Most frequent pair = l, o (appears 7 times)

  • Merge → new token: lo

Continue until the target vocabulary size is reached (e.g., 30,000)...

BPE learns to merge frequent character pairs


Training corpus: "low" (×5), "lower" (×2), "newest" (×6), "widest" (×3)

After training, the vocabulary includes:
- Characters: l, o, w, e, r, n, s, t, i, d
- Merges learned (in order): es → est → lo → low → ...

Tokenizing "lowest":
  Step 1: Split into characters → [l, o, w, e, s, t]
  Step 2: Apply the merge rules in the order they were learned:
    [l, o, w, e, s, t]
    → [l, o, w, es, t]  (merge e+s)
    → [l, o, w, est]    (merge es+t)
    → [lo, w, est]      (merge l+o)
    → [low, est]        (merge lo+w)

  Final tokens: ["low", "est"]

Result: "lowest" → ["low", "est"] (2 tokens instead of 6 characters!)

BPE visualization

[Figure: "lowest" at the character level (l | o | w | e | s | t), after the first three merges (lo | w | est), and after the final merge (low | est)]

"lowest" → [low, est]: BPE captures common patterns without any explicit linguistic rules!

BPE in practice with HuggingFace


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 uses BPE

text = "I'm learning about tokenization!"
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# ['I', "'m", 'Ġlearning', 'Ġabout', 'Ġtoken', 'ization', '!']
# Note: 'Ġ' represents a space

token_ids = tokenizer.encode(text)  # Get token IDs
print("Token IDs:", token_ids)  # [40, 1101, 4673, 546, 11241, 1634, 0]

decoded = tokenizer.decode(token_ids)  # Decode back to text
print("Decoded:", decoded)  # "I'm learning about tokenization!"

Use the Tokenization Explorer Demo to experiment with different tokenizers interactively.

WordPiece merges "surprisingly common" pairs

Wu et al. (2016, arXiv): Google's Neural Machine Translation System

Instead of merging the most frequent pair, WordPiece merges the pair that maximizes the likelihood of the training data:

\text{score}(x, y) = \frac{P(xy)}{P(x) \times P(y)}

  • Used by: BERT, DistilBERT, Electra
  • Special tokens: ## prefix for continuation ("playing" → ["play", "##ing"])
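A small sketch of this scoring rule on made-up counts: a pair can be less frequent than another yet still be merged first, because its parts are individually rare (all numbers below are invented for illustration):

# Hypothetical symbol and pair counts
unigram_counts = {"e": 500, "s": 300, "t": 400, "h": 100}
pair_counts = {("e", "s"): 150, ("t", "h"): 95}
total = sum(unigram_counts.values())

def wordpiece_score(pair):
    # score(x, y) = P(xy) / (P(x) * P(y)); merge the pair with the highest score
    x, y = pair
    p_xy = pair_counts[pair] / total
    return p_xy / ((unigram_counts[x] / total) * (unigram_counts[y] / total))

for pair in pair_counts:
    print(pair, round(wordpiece_score(pair), 2))
# ('e', 's') 1.3   -- frequent pair, but 'e' and 's' are common anyway
# ('t', 'h') 3.09  -- rarer pair, yet far more likely than chance, so it merges first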

WordPiece example with BERT


from transformers import AutoTokenizer

# BERT uses WordPiece
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "I'm learning about tokenization!"
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Output: ['i', "'", 'm', 'learning', 'about', 'token', '##ization', '!']
#                                              ^^^ note the ##

# Compare with a longer word
text2 = "unhappiness"
tokens2 = tokenizer.tokenize(text2)
print("Tokens:", tokens2)
# Output: ['un', '##hap', '##pin', '##ess']
# Breaks into morphological components!

BERT lowercases by default (unless using cased model)

SentencePiece works for languages without spaces

Kudo & Richardson (2018, EMNLP): SentencePiece: A simple and language independent subword tokenizer

Problem with BPE/WordPiece:

  • Assume pre-tokenized text
  • Fail on Chinese, Japanese, Thai...

SentencePiece solution:

  • Raw character stream input
  • Space = just another character (▁)

Used by: T5, ALBERT, XLNet, mT5 (multilingual models)

SentencePiece in action


from transformers import AutoTokenizer

# T5 uses SentencePiece
tokenizer = AutoTokenizer.from_pretrained("t5-small")

text = "I'm learning about tokenization!"
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Output: ['▁I', "'", 'm', '▁learning', '▁about', '▁token', 'ization', '!']
#            ^^^ note the ▁ marking spaces

# Works seamlessly with other languages!
text_chinese = "这是中文"  # "This is Chinese"
tokens_cn = tokenizer.tokenize(text_chinese)
print("Chinese:", tokens_cn)
# Breaks into characters/subwords without needing pre-tokenization

No language-specific preprocessing required!

Tokenization methods comparison

Method                  | Approach           | Used By          | Key Feature
BPE                     | Frequency merging  | GPT-2, RoBERTa   | Bottom-up
WordPiece               | Likelihood merging | BERT, DistilBERT | Principled scoring
SentencePiece (Unigram) | Top-down pruning   | T5, mT5          | Multilingual
SentencePiece (BPE)     | Bottom-up merging  | mBART            | Works on raw text

Comparing tokenizers side-by-side


from transformers import AutoTokenizer
text = "The unhappiest researchers couldn't preprocess data!"

models = {"GPT-2 (BPE)": "gpt2", "BERT (WordPiece)": "bert-base-uncased",
          "T5 (SentencePiece)": "t5-small"}

for name, model_name in models.items():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokens = tokenizer.tokenize(text)
    print(f"{name}: Tokens ({len(tokens)}): {tokens}")

# Observe: different handling of "unhappiest", spaces, and token counts!

Tokenizer comparison: Actual output

Input: "The unhappiest researchers couldn't preprocess data!"

Tokenizer | Tokens | Count
GPT-2 (BPE) | ['The', 'Ġun', 'happ', 'iest', 'Ġresearchers', 'Ġcouldn', "'t", 'Ġpre', 'process', 'Ġdata', '!'] | 11
BERT (WordPiece) | ['the', 'un', '##hap', '##pie', '##st', 'researchers', 'couldn', "'", 't', 'pre', '##process', 'data', '!'] | 13
T5 (SentencePiece) | ['▁The', '▁un', 'happiest', '▁researchers', '▁couldn', "'", 't', '▁pre', 'process', '▁data', '!'] | 11

  • Ġ (GPT-2) and ▁ (T5) mark word starts (spaces)
  • ## (BERT) marks continuation subwords
  • "unhappiest" is split differently by each
  • BERT lowercases; GPT-2/T5 preserve case

Tokenizers add special tokens for model-specific purposes

Token | Purpose | Used By
[CLS] | Classification token (start of sequence) | BERT
[SEP] | Separator between segments | BERT
[PAD] | Padding to a common length | Most models
[UNK] | Unknown/rare tokens | Most models
[MASK] | Masked token for pretraining | BERT
<s>, </s> | Start/end of sequence | RoBERTa, T5
<|endoftext|> | Document boundary | GPT-2

Example (BERT sentence-pair input): [CLS] The cat sat [SEP] The dog ran [SEP]

Working with special tokens


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Hello world"
encoded = tokenizer(text, return_tensors="pt")
print("Input IDs:", encoded['input_ids'])  # [101, 7592, 2088, 102]
# ^^^ [CLS] and [SEP] added automatically!

full_decode = tokenizer.decode(encoded['input_ids'][0])
print("With special:", full_decode)  # "[CLS] hello world [SEP]"

clean_decode = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
print("Without special:", clean_decode)  # "hello world"

Vocabulary size trade-offs

Smaller vocabulary (e.g., 10k tokens):

  • Faster training (smaller embedding matrix)
  • Less memory
  • Longer sequences (more subwords per word)
  • May lose semantic information

Larger vocabulary (e.g., 100k tokens):

  • Shorter sequences (closer to word-level)
  • Better semantic preservation
  • Slower training (larger embeddings)
  • More memory required

30k-50k tokens for most models: GPT-2: 50k | BERT: 30k | T5: 32k
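Back-of-the-envelope arithmetic behind this trade-off (a sketch; the 768-dimensional embedding size is an assumption, chosen to match BERT-base and GPT-2 small):

embedding_dim = 768  # assumed hidden size (BERT-base and GPT-2 small both use 768)

for vocab_size in (10_000, 30_000, 50_000, 100_000):
    params = vocab_size * embedding_dim  # parameters in the embedding matrix alone
    print(f"{vocab_size:>7,} tokens -> {params / 1e6:5.1f}M embedding parameters")

# 10,000 tokens -> 7.7M parameters; 100,000 tokens -> 76.8M parameters.
# A bigger vocabulary also means a bigger output softmax, while a smaller one
# splits words into more pieces, lengthening every input sequence.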

Tokenization pitfalls and gotchas

  1. Tokenizer-model mismatch: Always use the tokenizer that matches your model! GPT-2 tokenizer ≠ BERT tokenizer
  2. Maximum sequence length: BERT: 512 tokens | GPT-2: 1024 | GPT-3: 2048 — Text gets truncated if too long!
  3. Case sensitivity: bert-base-uncased lowercases everything; bert-base-cased preserves case
  4. Rare words → many tokens: "antidisestablishmentarianism" → 10+ tokens — Can hit sequence limit faster than expected!
  5. Special characters: Emoji, Unicode, accents may be split unexpectedly
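Two of these pitfalls are easy to demonstrate directly (a sketch using the same HuggingFace tokenizers as elsewhere in this lecture):

from transformers import AutoTokenizer

# Pitfall 2: text beyond the model's maximum length is silently truncated
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
long_text = "token " * 1000
encoded = tokenizer(long_text, truncation=True, max_length=512)
print(len(encoded["input_ids"]))  # 512 -- everything past the limit is dropped

# Pitfall 3: cased vs. uncased models see different tokens
uncased = AutoTokenizer.from_pretrained("bert-base-uncased")
cased = AutoTokenizer.from_pretrained("bert-base-cased")
print(uncased.tokenize("Hello World"))  # ['hello', 'world'] -- lowercased first
print(cased.tokenize("Hello World"))    # capitalization is preserved in the tokens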

Debugging tokenization


from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "The antidisestablishmentarianism debate continues!"

tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text)
print(f"Tokens ({len(tokens)}): {tokens}\n")

for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
    decoded = tokenizer.decode([token_id])
    print(f"{i:2d}. ID {token_id:5d} | Token: {token:20s} | Decoded: {decoded}")

print(f"\nVocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.all_special_tokens}")

Connection to human language learning

Saffran, Aslin, & Newport (1996, Science): Statistical learning by 8-month-old infants

Infant learning:

  • Babies track statistical regularities in speech
  • Identify word boundaries from transitional probabilities
  • No explicit rules—just patterns from exposure!

Parallel with subword tokenization:

  • BPE: Merge frequent character pairs → discover common morphemes
  • Infants: Track frequent syllable pairs → discover words
  • Both are data-driven, not rule-based!

Key differences: infants learn online, in real time, and multimodally (sound + context); BPE learns offline, in batch, from text alone, with no grounding.

Statistical learning in action

Infant Learning:

Input stream: bidakupadotigolabu...

Learn high-probability sequences:

  • bi-da-ku (word)
  • pa-do-ti (word)

Detect low-probability boundaries:

  • ku | pa (boundary)

BPE Learning:

Input corpus: low low lower...

Merge high-frequency pairs:

  • l + o → lo
  • lo + w → low

Build vocabulary:

  • low, lower, lowest

Both use distributional statistics to discover structure!
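A small sketch of the infant side of this parallel, using three made-up words like Saffran et al.'s stimuli (the stream length and seed below are invented): transitional probabilities are high within words and low across word boundaries.

import random
from collections import Counter

random.seed(0)
words = ["bi da ku", "pa do ti", "go la bu"]  # three made-up "words", as syllables
stream = []
for _ in range(200):                           # a continuous stream with no pauses
    stream.extend(random.choice(words).split())

# Transitional probability P(next syllable | current syllable)
bigrams = Counter(zip(stream, stream[1:]))
unigrams = Counter(stream[:-1])

def tp(a, b):
    return bigrams[(a, b)] / unigrams[a]

print(round(tp("bi", "da"), 2))  # within a word: 1.0 ("bi" is always followed by "da")
print(round(tp("ku", "pa"), 2))  # across a boundary: ~0.33 (any of three words can follow)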

Hands-on exercise

  1. Choose 3 models: GPT-2, BERT, T5
  2. Test on diverse texts:
    • Standard English: "The cat sat on the mat"
    • Complex words: "antidisestablishmentarianism"
    • Contractions: "I'm, you're, won't"
    • Typos: "teh qiuck brown fox"
    • Emoji: "I love this!"
    • Other languages: "这是中文" (Chinese)
  3. Compare results: Number of tokens, how words are split, handling of unknown/rare words
  4. Reflect: Which tokenizer works best for your use case? What are the trade-offs?

Use the Tokenization Explorer Demo to compare tokenizers interactively!
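A starter scaffold for the exercise (a sketch; assumes the transformers library is installed and the pretrained tokenizers can be downloaded):

from transformers import AutoTokenizer

models = {"GPT-2": "gpt2", "BERT": "bert-base-uncased", "T5": "t5-small"}
tests = [
    "The cat sat on the mat",        # standard English
    "antidisestablishmentarianism",  # complex word
    "I'm, you're, won't",            # contractions
    "teh qiuck brown fox",           # typos
    "这是中文",                       # another language (Chinese)
]

tokenizers = {name: AutoTokenizer.from_pretrained(m) for name, m in models.items()}
for text in tests:
    print(f"\nText: {text!r}")
    for name, tok in tokenizers.items():
        tokens = tok.tokenize(text)
        print(f"  {name:6s} ({len(tokens):2d} tokens): {tokens}")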

Discussion questions

  1. Linguistics vs. Statistics: BPE discovers morphemes (un-, -ing, -ness) without linguistic rules. Is this "learning" morphology, or just pattern matching?

  2. Cross-lingual tokenization: Should we use the same tokenizer for all languages? What are the trade-offs?

  3. Semantic preservation: Does breaking "unhappy" into ["un", "happy"] preserve meaning? What about "butterfly"?

  4. Human vs. machine: Humans don't consciously tokenize words. Why do machines need to?

  5. Future directions: Will we move toward character-level or byte-level models that don't need tokenization?

Primary references

  • Sennrich, Haddow, & Birch (2016). Neural Machine Translation of Rare Words with Subword Units. ACL.
  • Wu et al. (2016). Google's Neural Machine Translation System. arXiv.
  • Kudo & Richardson (2018). SentencePiece: A simple and language independent subword tokenizer. EMNLP.
  • Saffran, Aslin, & Newport (1996). Statistical learning by 8-month-old infants. Science.

Key takeaways

  1. Tokenization is fundamental: Bridges raw text and neural networks
  2. Word-level has major limitations: OOV problem, vocabulary explosion, language-dependent
  3. Subword tokenization is the sweet spot: Balanced vocabulary size and sequence length
  4. Different methods, similar principles: BPE: frequency-based | WordPiece: likelihood | SentencePiece: language-agnostic
  5. Statistical learning connects humans and machines: Both discover structure from distributional patterns
  6. Always match tokenizer to model! Critical for correct predictions

Looking ahead

How tokenization connects:

  • POS tagging: Token-level classification
  • Sentiment: Sequence-level classification
  • Both depend on good tokenization!
  • Experimenting with HuggingFace tokenizers
  • Thinking about: How does token granularity affect downstream tasks?
  • Exploring: Tokenizer artifacts and their impact

Questions? Want to chat more?

📧 Email me
💬 Join our Discord
💁 Come to office hours

Lecture 7 — X-Hour: Text Classification Workshop