* 2. Include this script: * 3. Create charts with minimal configuration - colors are auto-applied! */ (function() { 'use strict'; // ========================================================================== // READ COLORS FROM CSS CUSTOM PROPERTIES // This ensures chart colors stay in sync with the theme // ========================================================================== /** * Get a CSS custom property value from :root */ function getCSSVar(name, fallback = '') { if (typeof getComputedStyle === 'undefined') return fallback; const value = getComputedStyle(document.documentElement).getPropertyValue(name).trim(); return value || fallback; } /** * Build palette from CSS custom properties (with fallbacks) */ function buildPaletteFromCSS() { return { // Primary brand colors dartmouthGreen: getCSSVar('--dartmouth-green', '#00693e'), textPrimary: getCSSVar('--text-primary', '#0a2518'), textSecondary: getCSSVar('--text-secondary', '#0a3d23'), // Chart colors (from CSS --chart-color-N variables) chartColors: [ getCSSVar('--chart-color-1', '#00693e'), getCSSVar('--chart-color-2', '#267aba'), getCSSVar('--chart-color-3', '#ffa00f'), getCSSVar('--chart-color-4', '#9d162e'), getCSSVar('--chart-color-5', '#8a6996'), getCSSVar('--chart-color-6', '#a5d75f'), getCSSVar('--chart-color-7', '#003c73'), getCSSVar('--chart-color-8', '#d94415'), getCSSVar('--chart-color-9', '#643c20'), getCSSVar('--chart-color-10', '#c4dd88'), getCSSVar('--chart-color-11', '#f5dc69'), getCSSVar('--chart-color-12', '#424141'), ], // Background colors (semi-transparent versions) chartBgColors: [ getCSSVar('--chart-bg-1', 'rgba(0, 105, 62, 0.5)'), getCSSVar('--chart-bg-2', 'rgba(38, 122, 186, 0.5)'), getCSSVar('--chart-bg-3', 'rgba(255, 160, 15, 0.5)'), getCSSVar('--chart-bg-4', 'rgba(157, 22, 46, 0.5)'), getCSSVar('--chart-bg-5', 'rgba(138, 105, 150, 0.5)'), getCSSVar('--chart-bg-6', 'rgba(165, 215, 95, 0.5)'), ], // Semantic colors positive: getCSSVar('--chart-positive', '#00693e'), negative: 
getCSSVar('--chart-negative', '#9d162e'), neutral: getCSSVar('--chart-neutral', '#424141'), highlight: getCSSVar('--chart-highlight', '#ffa00f'), // Grid and axis colors gridLight: getCSSVar('--chart-grid-light', 'rgba(0, 105, 62, 0.1)'), gridMedium: getCSSVar('--chart-grid-medium', 'rgba(0, 105, 62, 0.15)'), gridDark: getCSSVar('--chart-grid-dark', 'rgba(0, 105, 62, 0.2)'), axisColor: getCSSVar('--chart-axis-color', '#0a2518'), // Font fontFamily: getCSSVar('--chart-font-family', "'Avenir LT Std', 'Avenir', 'Avenir Next', -apple-system, BlinkMacSystemFont, sans-serif"), }; } // Initialize palette (will be populated when DOM is ready) let CDL_PALETTE = null; // For convenience, expose primary chart colors array let CHART_COLORS = null; // ========================================================================== // FONT CONFIGURATION // Responsive font sizes based on typical Marp slide dimensions (1280x720) // ========================================================================== const FONT_CONFIG = { sizes: { title: 22, // Chart title subtitle: 18, // Subtitle legend: 16, // Legend labels axisTitle: 18, // Axis titles axisTicks: 16, // Axis tick labels tooltip: 14, // Tooltip text dataLabels: 14, // Data labels on charts }, weight: { normal: 400, medium: 500, bold: 600, }, }; // ========================================================================== // HELPER FUNCTIONS // ========================================================================== /** * Ensure palette is initialized */ function ensurePalette() { if (!CDL_PALETTE) { CDL_PALETTE = buildPaletteFromCSS(); CHART_COLORS = CDL_PALETTE.chartColors; } return CDL_PALETTE; } /** * Get color for a dataset at given index * Cycles through palette if more datasets than colors */ function getColor(index) { ensurePalette(); return CHART_COLORS[index % CHART_COLORS.length]; } /** * Get color with alpha transparency */ function getColorWithAlpha(color, alpha) { // Handle hex colors if (color.startsWith('#')) { 
const r = parseInt(color.slice(1, 3), 16); const g = parseInt(color.slice(3, 5), 16); const b = parseInt(color.slice(5, 7), 16); return `rgba(${r}, ${g}, ${b}, ${alpha})`; } // Handle rgba colors if (color.startsWith('rgba')) { return color.replace(/[\d.]+\)$/, `${alpha})`); } return color; } /** * Generate colors for all datasets in chart data * Automatically assigns colors if not specified */ function autoAssignColors(data, chartType) { if (!data || !data.datasets) return data; data.datasets.forEach((dataset, index) => { const baseColor = getColor(index); // Only assign colors if not already specified switch (chartType) { case 'bar': case 'horizontalBar': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'line': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.1); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 3; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 6; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } if (dataset.tension === undefined) { dataset.tension = 0.3; } break; case 'scatter': case 'bubble': if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 15; } if (dataset.pointHoverRadius === undefined) { dataset.pointHoverRadius = 18; } break; case 'pie': case 'doughnut': case 'polarArea': // For pie charts, we need multiple colors for one dataset if (!dataset.backgroundColor) { const numItems = dataset.data ? 
dataset.data.length : 6; dataset.backgroundColor = []; for (let i = 0; i < numItems; i++) { dataset.backgroundColor.push(getColor(i)); } } if (!dataset.borderColor) { dataset.borderColor = '#d8d8d8'; // Slide background } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } break; case 'radar': if (!dataset.borderColor) { dataset.borderColor = baseColor; } if (!dataset.backgroundColor) { dataset.backgroundColor = getColorWithAlpha(baseColor, 0.2); } if (dataset.borderWidth === undefined) { dataset.borderWidth = 2; } if (dataset.pointRadius === undefined) { dataset.pointRadius = 4; } if (!dataset.pointBackgroundColor) { dataset.pointBackgroundColor = baseColor; } break; default: // Generic color assignment if (!dataset.backgroundColor) { dataset.backgroundColor = baseColor; } if (!dataset.borderColor) { dataset.borderColor = baseColor; } } }); return data; } // ========================================================================== // CHART.JS GLOBAL DEFAULTS // ========================================================================== function applyGlobalDefaults() { if (typeof Chart === 'undefined') { console.warn('Chart.js not loaded. 
chart-defaults.js requires Chart.js to be loaded first.'); return false; } // Ensure palette is loaded from CSS const palette = ensurePalette(); // Font defaults Chart.defaults.font.family = palette.fontFamily; Chart.defaults.font.size = FONT_CONFIG.sizes.axisTicks; Chart.defaults.color = palette.textPrimary; // Responsive defaults Chart.defaults.responsive = true; Chart.defaults.maintainAspectRatio = false; // Animation (subtle) Chart.defaults.animation.duration = 400; // Plugin defaults // Legend Chart.defaults.plugins.legend.labels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, weight: FONT_CONFIG.weight.normal, }; Chart.defaults.plugins.legend.labels.color = palette.textPrimary; Chart.defaults.plugins.legend.labels.usePointStyle = true; Chart.defaults.plugins.legend.labels.padding = 20; // Title Chart.defaults.plugins.title.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.title, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.title.color = palette.textPrimary; // Tooltip Chart.defaults.plugins.tooltip.backgroundColor = palette.textPrimary; Chart.defaults.plugins.tooltip.titleFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, weight: FONT_CONFIG.weight.medium, }; Chart.defaults.plugins.tooltip.bodyFont = { family: palette.fontFamily, size: FONT_CONFIG.sizes.tooltip, }; Chart.defaults.plugins.tooltip.cornerRadius = 4; Chart.defaults.plugins.tooltip.padding = 10; // Scale defaults (for cartesian charts) // These need to be applied per-scale type const scaleDefaults = { grid: { color: palette.gridLight, lineWidth: 1, }, border: { color: palette.gridDark, width: 1, }, ticks: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }, color: palette.textPrimary, }, title: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTitle, weight: FONT_CONFIG.weight.normal, }, color: palette.textPrimary, }, }; // Apply scale defaults to linear scale if (Chart.defaults.scales && 
Chart.defaults.scales.linear) { if (Chart.defaults.scales.linear.grid) Object.assign(Chart.defaults.scales.linear.grid, scaleDefaults.grid); if (Chart.defaults.scales.linear.border) Object.assign(Chart.defaults.scales.linear.border, scaleDefaults.border); if (Chart.defaults.scales.linear.ticks) Object.assign(Chart.defaults.scales.linear.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.linear.title) Object.assign(Chart.defaults.scales.linear.title, scaleDefaults.title); } // Apply scale defaults to category scale if (Chart.defaults.scales && Chart.defaults.scales.category) { if (Chart.defaults.scales.category.grid) Object.assign(Chart.defaults.scales.category.grid, scaleDefaults.grid); if (Chart.defaults.scales.category.border) Object.assign(Chart.defaults.scales.category.border, scaleDefaults.border); if (Chart.defaults.scales.category.ticks) Object.assign(Chart.defaults.scales.category.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.category.title) Object.assign(Chart.defaults.scales.category.title, scaleDefaults.title); } // Apply scale defaults to logarithmic scale if (Chart.defaults.scales && Chart.defaults.scales.logarithmic) { if (Chart.defaults.scales.logarithmic.grid) Object.assign(Chart.defaults.scales.logarithmic.grid, scaleDefaults.grid); if (Chart.defaults.scales.logarithmic.border) Object.assign(Chart.defaults.scales.logarithmic.border, scaleDefaults.border); if (Chart.defaults.scales.logarithmic.ticks) Object.assign(Chart.defaults.scales.logarithmic.ticks, scaleDefaults.ticks); if (Chart.defaults.scales.logarithmic.title) Object.assign(Chart.defaults.scales.logarithmic.title, scaleDefaults.title); } // Apply scale defaults to radial scale (for radar charts) if (Chart.defaults.scales && Chart.defaults.scales.radialLinear) { if (Chart.defaults.scales.radialLinear.grid) Chart.defaults.scales.radialLinear.grid.color = palette.gridLight; if (Chart.defaults.scales.radialLinear.angleLines) Chart.defaults.scales.radialLinear.angleLines.color = 
palette.gridMedium; if (Chart.defaults.scales.radialLinear.pointLabels) { Chart.defaults.scales.radialLinear.pointLabels.font = { family: palette.fontFamily, size: FONT_CONFIG.sizes.axisTicks, }; Chart.defaults.scales.radialLinear.pointLabels.color = palette.textPrimary; } } return true; } // ========================================================================== // CHART WRAPPER FOR AUTO-STYLING // ========================================================================== /** * Wrap the Chart constructor to automatically apply CDL styling */ function wrapChartConstructor() { if (typeof Chart === 'undefined') return; const OriginalChart = Chart; // Create a wrapper that auto-applies colors window.Chart = function(ctx, config) { // Auto-assign colors if not specified if (config && config.data) { config.data = autoAssignColors(config.data, config.type); } // Merge default options for specific chart types if (config && config.options) { config.options = applyChartTypeDefaults(config.type, config.options); } // Call original constructor return new OriginalChart(ctx, config); }; // Copy static properties and methods Object.setPrototypeOf(window.Chart, OriginalChart); Object.assign(window.Chart, OriginalChart); // Preserve the prototype chain window.Chart.prototype = OriginalChart.prototype; } /** * Apply chart-type specific defaults */ function applyChartTypeDefaults(chartType, userOptions) { const options = { ...userOptions }; switch (chartType) { case 'bar': case 'horizontalBar': // Bar chart defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; // Hide x-axis grid for cleaner look if (options.scales.x.grid === undefined) { options.scales.x.grid = { display: false }; } break; case 'line': // Line chart defaults if (!options.interaction) { options.interaction = { intersect: false, mode: 'index' }; } break; case 'pie': case 'doughnut': // Pie/doughnut defaults if 
(!options.plugins) options.plugins = {}; if (options.plugins.legend === undefined) { const palette = ensurePalette(); options.plugins.legend = { position: 'right', labels: { font: { family: palette.fontFamily, size: FONT_CONFIG.sizes.legend, }, color: palette.textPrimary, padding: 15, }, }; } break; case 'radar': // Radar chart defaults - keep as-is, scale defaults applied globally break; case 'scatter': case 'bubble': // Scatter/bubble defaults if (!options.scales) options.scales = {}; if (!options.scales.x) options.scales.x = {}; if (!options.scales.y) options.scales.y = {}; break; } return options; } // ========================================================================== // CONVENIENCE FUNCTIONS FOR USERS // Exposed on window.CDLChart for easy access // ========================================================================== window.CDLChart = { // Color palette access (getters to ensure lazy initialization) get colors() { return ensurePalette().chartColors; }, get palette() { return ensurePalette(); }, // Get specific color by index getColor: getColor, // Get color with transparency getColorWithAlpha: getColorWithAlpha, // Get array of colors for a specific count getColors: function(count) { ensurePalette(); const result = []; for (let i = 0; i < count; i++) { result.push(getColor(i)); } return result; }, // Font configuration fonts: FONT_CONFIG, // Quick chart creation helpers // These create minimal config that auto-applies all styling /** * Create a simple bar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - X-axis labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ bar: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'bar', data: { labels: labels, datasets: [{ data: data }], }, options: { plugins: { legend: { display: false } }, ...options, }, }); }, /** * Create a simple line chart * @param {string} canvasId - 
Canvas element ID * @param {string[]} labels - X-axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ line: function(canvasId, labels, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'line', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, fill: ds.fill !== undefined ? ds.fill : true, })), }, options: options, }); }, /** * Create a simple pie chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ pie: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'pie', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a simple scatter chart * @param {string} canvasId - Canvas element ID * @param {Array} datasets - Array of {label, data: [{x, y}]} objects * @param {object} options - Optional overrides */ scatter: function(canvasId, datasets, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'scatter', data: { datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, /** * Create a doughnut chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Slice labels * @param {number[]} data - Data values * @param {object} options - Optional overrides */ doughnut: function(canvasId, labels, data, options = {}) { return new Chart(document.getElementById(canvasId), { type: 'doughnut', data: { labels: labels, datasets: [{ data: data }], }, options: options, }); }, /** * Create a radar chart * @param {string} canvasId - Canvas element ID * @param {string[]} labels - Axis labels * @param {Array} datasets - Array of {label, data} objects * @param {object} options - Optional overrides */ radar: function(canvasId, labels, datasets, 
options = {}) { return new Chart(document.getElementById(canvasId), { type: 'radar', data: { labels: labels, datasets: datasets.map(ds => ({ label: ds.label, data: ds.data, })), }, options: options, }); }, }; // ========================================================================== // INITIALIZATION // ========================================================================== function initialize() { // Wait for Chart.js to be available if (typeof Chart !== 'undefined') { applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully.'); return true; } else { // Chart.js not yet loaded - wait and retry let retries = 0; const maxRetries = 50; // 5 seconds max wait const checkInterval = setInterval(function() { retries++; if (typeof Chart !== 'undefined') { clearInterval(checkInterval); applyGlobalDefaults(); wrapChartConstructor(); console.log('CDL Chart defaults applied successfully (after waiting for Chart.js).'); } else if (retries >= maxRetries) { clearInterval(checkInterval); console.warn('Chart.js not found after waiting. CDL Chart defaults not applied.'); } }, 100); return false; } } // Initialize IMMEDIATELY - this must run BEFORE any chart creation scripts // Chart.js CDN should be loaded before this script initialize(); })();
English has ~170,000 words, each inflection counts separately ("run", "runs", "running", "ran"), and compounds vary ("ice cream", "icecream", "ice-cream")
OOV for new terms ("COVID-19", "selfie"), rare words ("supercalifragilisticexpialidocious"), and typos ("teh")
Chinese: 没有空格 | Japanese: 日本語も同様
Word-level vocabulary keeps growing! Subword vocabulary stabilizes at a reasonable size.
# Simple word-level vocabulary
vocab = {"hello", "world", "the", "cat", "sat"}

def tokenize_word_level(text, vocab):
    """Lowercase `text`, split on whitespace, and map out-of-vocabulary words to "<UNK>".

    Punctuation is NOT stripped, so "hello!" is a different token from "hello".
    """
    tokens = text.lower().split()
    result = []
    for token in tokens:
        if token in vocab:
            result.append(token)
        else:
            result.append("<UNK>")  # Unknown token
    return result

# Example
text = "Hello! The cat jumped"
tokens = tokenize_word_level(text, vocab)
print(tokens)
# Output: ['<UNK>', 'the', 'cat', '<UNK>']
#          ^^^ "hello!" (punctuation attached) and "jumped" are both unknown!
We lose information! "jumped" becomes meaningless <UNK>
Examples:
Benefits:
Learn common character sequences from data! Works for any language.
# Word-level vocabulary (only words seen in training)
vocab = {"the", "cat", "sat", "on", "mat", "dog", "ran"}

# Trying to tokenize a new sentence:
sentence = "The supercalifragilisticexpialidocious cat meowed"
# Word-level result (case-sensitive lookup, so "The" != "the"):
#   ["<UNK>", "<UNK>", "cat", "<UNK>"]
# We lost almost everything!
The subword solution:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Same difficult sentence:
tokens = tokenizer.tokenize("supercalifragilisticexpialidocious")
print(tokens)
# ['super', 'cal', 'if', 'rag', 'il', 'istic', 'exp', 'ial', 'id', 'ocious']

# Every word can be tokenized! No information loss.
# Model can learn that 'super-' often means "very/above"
Subwords preserve meaning even for novel words!
More tokens = longer sequences, but smaller vocabulary and no OOV!
Sennrich, Haddow, & Birch (2016, ACL): Neural Machine Translation of Rare Words with Subword Units
Iteratively merge the most frequent pair of characters/subwords.
Training data: "low low low lower lower newest newest newest newest widest"
Initial tokens (characters): l, o, w, e, r, n, s, t, i, d
Step 1: merge the frequent pair e, s (appears 5 times: 4× in "newest", 1× in "widest") → es
Vocabulary: l, o, w, e, r, n, s, t, i, d, es
Step 2: most frequent = es, t → est
Vocabulary: l, o, w, e, r, n, s, t, i, d, es, est
Step 3: most frequent = l, o → lo
Continue until reaching target vocabulary size (e.g., 30,000)...
1Training corpus: "low" (x3), "lower" (x2), "newest" (x4), "widest" (x1)
2
3After training, vocabulary includes:
4- Characters: l, o, w, e, r, n, s, t, i, d
5- Merges learned: es → est → lo → low → er → ...
6
7Tokenizing "lowest":
8 Step 1: Split into characters → [l, o, w, e, s, t]
9 Step 2: Apply merge rules in order learned:
10 [l, o, w, e, s, t]
11 → [lo, w, e, s, t] (merge l+o)
12 → [low, e, s, t] (merge lo+w)
13 → [low, es, t] (merge e+s)
14 → [low, est] (merge es+t)
15
16 Final tokens: ["low", "est"]
Result: "lowest" → ["low", "est"] (2 tokens instead of 6 characters!)
With only the first three merges (es, est, lo): "lowest" → [lo, w, est] — captures common patterns without explicit linguistic rules!
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 uses BPE

text = "I'm learning about tokenization!"
tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# ['I', "'m", 'Ġlearning', 'Ġabout', 'Ġtoken', 'ization', '!']
# Note: 'Ġ' represents space

token_ids = tokenizer.encode(text)  # Get token IDs
print("Token IDs:", token_ids)  # [40, 1101, 4673, 546, 11241, 1634, 0]

decoded = tokenizer.decode(token_ids)  # Decode back
print("Decoded:", decoded)  # "I'm learning about tokenization!"
Use the Tokenization Explorer Demo to experiment with different tokenizers interactively.
Wu et al. (2016, arXiv): Google's Neural Machine Translation System
Instead of merging most frequent pair, merge pair that maximizes likelihood:
## prefix for continuation ("playing" → ["play", "##ing"])
from transformers import AutoTokenizer

# BERT uses WordPiece
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "I'm learning about tokenization!"

tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Output: ['i', "'", 'm', 'learning', 'about', 'token', '##ization', '!']
#                                                       ^^^ Note the ##

# Compare with longer word
text2 = "unhappiness"
tokens2 = tokenizer.tokenize(text2)
print("Tokens:", tokens2)
# Output: ['un', '##hap', '##pin', '##ess']
# Breaks into morphological components!
BERT lowercases by default (unless using cased model)
Kudo & Richardson (2018, EMNLP): SentencePiece: A simple and language independent subword tokenizer
Problem with BPE/WordPiece:
SentencePiece solution:
Spaces are encoded as a visible marker symbol (▁)
Used by: T5, ALBERT, XLNet, mT5 (multilingual models)
from transformers import AutoTokenizer

# T5 uses SentencePiece
tokenizer = AutoTokenizer.from_pretrained("t5-small")

text = "I'm learning about tokenization!"

tokens = tokenizer.tokenize(text)
print("Tokens:", tokens)
# Output: ['▁I', "'", 'm', '▁learning', '▁about', '▁token', 'ization', '!']
#           ^^^ Note the ▁ for spaces

# Works seamlessly with other languages!
text_chinese = "没有空格"  # "there are no spaces"
tokens_cn = tokenizer.tokenize(text_chinese)
print("Chinese:", tokens_cn)
# Breaks into characters/subwords without needing pre-tokenization
No language-specific preprocessing required!
| Method | Approach | Used By | Key Feature |
|---|---|---|---|
| BPE | Frequency merging | GPT-2, RoBERTa | Bottom-up |
| WordPiece | Likelihood merging | BERT, DistilBERT | Principled scoring |
| SentencePiece (Unigram) | Top-down pruning | T5, mT5 | Multilingual |
| SentencePiece (BPE) | Bottom-up | mBART | Works with raw text |
from transformers import AutoTokenizer
text = "The unhappiest researchers couldn't preprocess data!"

models = {"GPT-2 (BPE)": "gpt2", "BERT (WordPiece)": "bert-base-uncased",
          "T5 (SentencePiece)": "t5-small"}

for name, model_name in models.items():
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokens = tokenizer.tokenize(text)
    print(f"{name}: Tokens ({len(tokens)}): {tokens}")

# Observe: Different handling of "unhappiest", spaces, and token counts!
Input: "The unhappiest researchers couldn't preprocess data!"
| Tokenizer | Tokens | Count |
|---|---|---|
| GPT-2 (BPE) | ['The', 'Ġun', 'happ', 'iest', 'Ġresearchers', 'Ġcouldn', "'t", 'Ġpre', 'process', 'Ġdata', '!'] | 11 |
| BERT (WordPiece) | ['the', 'un', '##hap', '##pie', '##st', 'researchers', 'couldn', "'", 't', 'pre', '##process', 'data', '!'] | 13 |
| T5 (SentencePiece) | ['▁The', '▁un', 'happiest', '▁researchers', '▁couldn', "'", 't', '▁pre', 'process', '▁data', '!'] | 11 |
Ġ (GPT-2) and ▁ (T5) mark word starts (spaces)
## (BERT) marks continuation subwords
| Token | Purpose | Used By |
|---|---|---|
| [CLS] | Classification token (start) | BERT |
| [SEP] | Separator between segments | BERT |
| [PAD] | Padding to same length | Most models |
| [UNK] | Unknown/rare tokens | Most models |
| [MASK] | Masked token for training | BERT |
| <s>, </s> | Start/end of sequence | RoBERTa, BART (T5 uses only </s>) |
| <|endoftext|> | Document boundary | GPT-2 |
[CLS] The cat sat [SEP] The dog ran [SEP]
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

text = "Hello world"
encoded = tokenizer(text, return_tensors="pt")
print("Input IDs:", encoded['input_ids'])  # [101, 7592, 2088, 102]
#                                             ^^^ [CLS] and [SEP] added automatically!

full_decode = tokenizer.decode(encoded['input_ids'][0])
print("With special:", full_decode)  # "[CLS] hello world [SEP]"

clean_decode = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
print("Without special:", clean_decode)  # "hello world"
Smaller vocabulary (e.g., 10k tokens):
Larger vocabulary (e.g., 100k tokens):
30k-50k tokens for most models: GPT-2: 50k | BERT: 30k | T5: 32k
bert-base-uncased lowercases everything; bert-base-cased preserves case
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
text = "The antidisestablishmentarianism debate continues!"

tokens = tokenizer.tokenize(text)
token_ids = tokenizer.encode(text)
print(f"Tokens ({len(tokens)}): {tokens}\n")

# Show each token next to its ID and its decoded text
for i, (token, token_id) in enumerate(zip(tokens, token_ids)):
    decoded = tokenizer.decode([token_id])
    print(f"{i:2d}. ID {token_id:5d} | Token: {token:20s} | Decoded: {decoded}")

print(f"\nVocabulary size: {tokenizer.vocab_size}")
print(f"Special tokens: {tokenizer.all_special_tokens}")
Saffran, Aslin, & Newport (1996, Science): Statistical learning by 8-month-old infants
Infant learning:
Parallel with subword tokenization:
Infants: Online, real-time, multimodal (sound + context) | BPE: Batch, text-only, no grounding
Infant Learning:
Input stream: bidakupadotigolabu...
Learn high-probability sequences:
bi-da-ku (word)
pa-do-ti (word)
Detect low-probability boundaries:
ku | pa (boundary)
BPE Learning:
Input corpus: low low lower...
Merge high-frequency pairs:
l+o → lo
lo+w → low
Build vocabulary:
low, lower, lowest
Both use distributional statistics to discover structure!
Use the Tokenization Explorer Demo to compare tokenizers interactively!
Linguistics vs. Statistics: BPE discovers morphemes (un-, -ing, -ness) without linguistic rules. Is this "learning" morphology, or just pattern matching?
Cross-lingual tokenization: Should we use the same tokenizer for all languages? What are the trade-offs?
Semantic preservation: Does breaking "unhappy" into ["un", "happy"] preserve meaning? What about "butterfly"?
Human vs. machine: Humans don't consciously tokenize words. Why do machines need to?
Future directions: Will we move toward character-level or byte-level models that don't need tokenization?
How tokenization connects:
Lecture 7 — X-Hour: Text Classification Workshop