BPE Step-by-Step Implementation#

In this lecture, we will walk through the implementation of Byte Pair Encoding (BPE), a popular subword tokenization method. We will use a dataset of financial news headlines for this demonstration.

Dataset Preparation#

First, we need to load our dataset. We will use the ashraq/financial-news dataset from the Hugging Face Hub and randomly sample 1,000 headlines from it for this demonstration.

from datasets import load_dataset

dataset = load_dataset("ashraq/financial-news")
texts = dataset["train"].shuffle(seed=1234).select(range(1000))["headline"]
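
Before building a vocabulary, it is worth glancing at a few of the sampled headlines (a quick optional inspection; the exact headlines depend on the random sample):

for headline in texts[:3]:
    print(headline)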

BPE Implementation#

Now, let’s dive into the implementation of BPE.

Initialization#

We start by initializing our vocabulary. We format each word by separating its characters with spaces and appending the special end-of-word token ▁, so that word boundaries are preserved.

import re, collections


def format_word(text, space_token="▁"):
    return " ".join(list(text)) + " " + space_token


def initialize_vocab(texts, lowercase=True):
    vocab = {}

    for text in texts:
        if lowercase:
            text = text.lower()
        text = re.sub(r"\s+", " ", text)
        all_words = text.split()
        for word in all_words:
            word = format_word(word)
            vocab[word] = vocab.get(word, 0) + 1
    return vocab


vocab = initialize_vocab(texts)
print(f"Number of words: {len(vocab)}")
Number of words: 3636
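
To see what the formatted entries look like, we can print a few vocabulary items (purely for inspection; which words and counts appear depends on the sampled headlines):

for word, freq in list(vocab.items())[:3]:
    print(f"{freq:4d}  {word}")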

Token Extraction#

Next, we extract all unique tokens from our vocabulary and count their frequencies.

def get_tokens_from_vocab(vocab):
    tokens = collections.defaultdict(int)
    vocab_tokenization = {}
    for word, freq in vocab.items():
        word_tokens = word.split()
        for token in word_tokens:
            tokens[token] += freq
        vocab_tokenization["".join(word_tokens)] = word_tokens
    return tokens, vocab_tokenization


tokens, vocab_tokenization = get_tokens_from_vocab(vocab)
print(f"Number of tokens: {len(tokens)}")
Number of tokens: 64
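
At this stage every word is still split into single characters plus the ▁ marker. As a quick sanity check we can look up one entry, using .get since the word is only present if it occurred in the sample:

print(vocab_tokenization.get("stocks▁"))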

Bigram Counts#

We then count the frequency of each bigram (pair of consecutive tokens) in our vocabulary.

def get_bigram_counts(vocab):
    pairs = {}
    for word, count in vocab.items():
        symbols = word.split()
        for i in range(len(symbols) - 1):
            pair = (symbols[i], symbols[i + 1])
            pairs[pair] = pairs.get(pair, 0) + count
    return pairs
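
Before any merges are performed, we can peek at the most frequent pairs (a quick check; the exact counts depend on the sampled data):

pairs = get_bigram_counts(vocab)
print(sorted(pairs.items(), key=lambda kv: kv[1], reverse=True)[:5])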

Merge Operations#

The core of BPE is a series of merge operations. In each operation, we find the most frequent bigram and merge it into a single token. We repeat this process for a specified number of iterations.

def merge_vocab(pair, vocab_in):
    vocab_out = {}
    bigram = re.escape(" ".join(pair))
    p = re.compile(r"(?<!\S)" + bigram + r"(?!\S)")
    bytepair = "".join(pair)
    for word in vocab_in:
        w_out = p.sub(bytepair, word)
        vocab_out[w_out] = vocab_in[word]
    return vocab_out, (bigram, bytepair)


def find_merges(vocab, tokens, num_merges, indices_to_print=[0, 1, 2]):
    merges = []
    for i in range(num_merges):
        pairs = get_bigram_counts(vocab)
        best_pair = max(pairs, key=pairs.get)
        best_count = pairs[best_pair]

        vocab, (bigram, bytepair) = merge_vocab(best_pair, vocab)
        merges.append((r"(?<!\S)" + bigram + r"(?!\S)", bytepair))
        tokens, vocab_tokenization = get_tokens_from_vocab(vocab)
        if i in indices_to_print:
            print(f"Merge {i}: {best_pair} with count {best_count}")
            print("All tokens: {}".format(tokens.keys()))
            print("Number of tokens: {}".format(len(tokens.keys())))

    return vocab, tokens, merges, vocab_tokenization


num_merges = 1000
indices_to_print = [0, 1, 2, num_merges - 1]

vocab, tokens, merges, vocab_tokenization = find_merges(
    vocab, tokens, num_merges, indices_to_print
)
Merge 0: ('s', '▁') with count 1639
All tokens: dict_keys(['t', 'r', 'u', 'd', 'e', 'a', '▁', 'w', 'i', 'l', 'c', 'o', 'n', 'g', 'v', 'm', ',', 'h', '.', 'k', "'", 's▁', 'j', 's', 'f', 'y', ':', 'p', '(', ')', '?', 'b', ';', 'z', '2', '0', '1', '8', '&', '5', '-', 'q', '3', 'x', '4', '7', '%', '$', '9', '/', '6', '—', '…', '–', '|', '!', '#', '[', ']', '~', 'é', '+', '®', '"', '€'])
Number of tokens: 65
Merge 1: ('i', 'n') with count 1092
All tokens: dict_keys(['t', 'r', 'u', 'd', 'e', 'a', '▁', 'w', 'i', 'l', 'c', 'o', 'n', 'g', 'v', 'm', 'in', ',', 'h', '.', 'k', "'", 's▁', 'j', 's', 'f', 'y', ':', 'p', '(', ')', '?', 'b', ';', 'z', '2', '0', '1', '8', '&', '5', '-', 'q', '3', 'x', '4', '7', '%', '$', '9', '/', '6', '—', '…', '–', '|', '!', '#', '[', ']', '~', 'é', '+', '®', '"', '€'])
Number of tokens: 66
Merge 2: ('e', '▁') with count 900
All tokens: dict_keys(['t', 'r', 'u', 'd', 'e', 'a', '▁', 'w', 'i', 'l', 'c', 'o', 'n', 'g', 'v', 'm', 'in', ',', 'h', 'e▁', '.', 'k', "'", 's▁', 'j', 's', 'f', 'y', ':', 'p', '(', ')', '?', 'b', ';', 'z', '2', '0', '1', '8', '&', '5', '-', 'q', '3', 'x', '4', '7', '%', '$', '9', '/', '6', '—', '…', '–', '|', '!', '#', '[', ']', '~', 'é', '+', '®', '"', '€'])
Number of tokens: 67
Merge 999: ('cor', 'p.▁') with count 6
All tokens: dict_keys(['tru', 'de', 'au', '▁', 'will▁', 'le', 'ad▁', 'a▁', 'co', 'al', 'iti', 'on▁', 'go', 'ver', 'n', 'ment▁', 'in▁', 'canad', 'a,▁', 'wh', 'il', 'e▁', 'the▁', 'u.', 'k', '.', "'s▁", 'jo', 'h', 'son▁', 'fi', 'gh', 'ts▁', 'an', 'o', 'ther▁', 'day▁', 'week▁', 'ahe', 'ad', ':▁', 'fo', 'm', 'c▁', 'me', 'et', 'ing', ',▁', 'g', 'd', 'p▁', 'and▁', 'more▁', 'earnings▁', 'is▁', 'l', 'um', 'in', '(', 'n)▁', 'great▁', 'growth▁', 'stock', '?▁', 'hu', 'ge▁', 'min', 'er▁', 'bank', 'r', 'up', 't', 'ci', 'es▁', 'po', 's', 'si', 'ble▁', 'so', 'on', ';▁', 'news▁', 'for▁', 'gold▁', 'sil', 'b', 'stre', 'am', 'ing▁', 'compani', 'e', 's?▁', 'ni', 'z', 'ant▁', 'en', 'th', 's▁', 'buy', 'back▁', 'pro', 'gr', 'am▁', 'ite▁', 'hou', 'se▁', 'sing▁', 'cu', 'op', 'i', 'id▁', 'produc', 'tion▁', 'global▁', 'bond▁', 'markets▁', 'to▁', 'ter▁', 'new▁', 'ph', 'ase▁', '2018▁', 'mar', 'sh▁', '&▁', 'cl', 'an▁', '(m', 'c)▁', 'hits▁', '5', '2', '-week▁', 'high▁', 'li', 'd▁', 'q3▁', 'results▁', 'tracking▁', 'pre', 'm▁', 'wat', "a's▁", 'fa', 'ir', 'x▁', 'financial▁', 'holdings▁', 'portfolio▁', '-▁', 'q4▁', '2017▁', 'update▁', 'bi', 'corporation▁', 'q2▁', 'call▁', 'slides▁', 'ver▁', 'ay▁', 'old▁', 'tri', 'c', 'cap', 'it', 'up▁', '20', '%▁', 'after▁', 'our', 'retail▁', 'sales▁', 'y▁', 'gain▁', 'stocks▁', 'fall▁', 'ahead▁', 'of▁', 'f', 'ed▁', 'gn', 'inc.▁', 'ters▁', 'over', 'ter', 'or', 'tale▁', 'tape▁', 'tu', 'ck', 'retirement▁', 'systems▁', 'insur', 'ance▁', 'trust▁', 'fund▁', 'buys▁', 'w', 't▁', 'dis', 'ne', 'he', 'ss▁', 'corp,▁', 'first▁', '...▁', 'ar', 'ry▁', 'ro', "s'▁", 'view▁', 'capital▁', 'management▁', '2015▁', 'tran', 'sc', 'ra', '$', 'b▁', 'ex', 'pan', 'sion▁', 'pl', 'no', 'v', 'gas▁', 'syste', 'ares▁', 'sol', 'ar▁', 'partn', 'er', 'shi', 'invest▁', 're', 'siden', 'tial▁', 'asse', 'by▁', 'ber', 's:▁', 'technology▁', 'with▁', 'rising▁', 'expect', 'ations▁', 'canadian▁', 'natural▁', 'resources▁', 'declares▁', 'ca', '0.', '3', '7', '5▁', 'dividend▁', 'vetr▁', 'community▁', 'has▁', 'downgraded▁', 'u', '-stars▁', 'im', 'mun', 'medic', 'high', 'ly▁', 'invest', 'p', 'du', 'inter', 'nation', '201', 'set▁', 'cor', 's,▁', 'fu', 'll▁', 'upg', 'raded▁', 'sis▁', '-star', 's.▁', 'higher▁', 'mb', 'p,▁', 'revenues▁', 'a', ')▁', 'earning', 'u.s.▁', 'back', 'bro', 'nor', 'th▁', 'anc', 'tions▁', 'how▁', 'rea', 'ch', '1', 'cr', 'amer', 'ma', 'mone', '9', '/', '4', '/1', '8', 'st', 'one▁', 'llc▁', 'point▁', 'se', 'ct▁', 'banc', 'inc,▁', 'at', 'y', 'k▁', 'ic', 'ip', 'al▁', 'income▁', '$0.0', '3▁', 'la', 'ru', 'sh', 'earnings:▁', 'why▁', 'as', "'t▁", 'stock▁', 'follow', 'su', 'wi', 'ceo▁', 'el▁', 'l▁', 'jon', 'sells▁', '10', ',', '0', '00▁', 'shares▁', 'top▁', 'ranked▁', 'mo', 'men', 'buy▁', 'march▁', 'ation▁', 'alty▁', 'ed', 'ran', 'dy▁', 'ur', 'che', '2016▁', 'transcript▁', 'mon', 'z▁', 'international▁', 'estimates▁', 'at▁', 'its▁', 'next▁', 'report', 'her', 'age▁', 'com', 'mer', 'ce▁', 'beats▁', '1,▁', 'revenue▁', 'in-', 'line▁', 'ag', 'ri', 'tur', 'market▁', 'report▁', 'thur', 'da', 'y,▁', '.▁', '6▁', 'ke', 'w▁', 'hold', '.5%▁', 'siti', 'portfolio', 'ol', 'ging▁', 'china▁', 'merger▁', 'ors▁', 'contin', 'ue▁', 'ry', 'sl', 'group▁', '1▁', 'auto▁', 'acqui', 'res▁', 'for', 'ers▁', 'analyst▁', 'blog▁', 'lif', 'est', 'ta', 'kes▁', 'ac', 'ate▁', 'investor▁', 'con', 'den', 'ous▁', '9▁', 'ali', 'o▁', 'continues▁', 'ti', 've▁', 'ff', 'st▁', 'cont', "'", 'ey', "'▁", 'goo', 'le▁', 'sho', 'cks▁', 'sch', 'ste', 'reit▁', 'sen', 'trad', 'n▁', 'pharma▁', 'ata▁', 'gainers▁', 'financ', "e's▁", '(h', '2019▁', 'all', 'scrip', 'el', 'ps▁', 'ho', 
'sp', 'ital▁', 'chi', 'ear', 'lo', 'ws▁', 'mobil', 'vi', 'en▁', 'out▁', 'merg', 'strate', 'gy▁', 'quar', 'commentary▁', 'bank▁', 'les▁', 'be▁', 'arm', 'marke', 'economi', 'value▁', 'company▁', 'home▁', 'ak▁', 'ak', 's)▁', 'por', 'wor', 'ks▁', 'int', 'industri', 'strong▁', 'data▁', 'ds▁', 'cen', 'us▁', 'energy▁', 've', 'als▁', 'rise▁', '4▁', 'oil▁', 'out', 'pu', 'br', 'pac', 'consumer▁', 'di', 'tion', 'ary▁', 'pe', 'this▁', 'hn▁', 'bu', 'health▁', 'care▁', 'be', '-', '2▁', 'or▁', 'mor', ')', 'moving▁', 'average▁', 'ros', 'pot', 'stru', 'ments▁', 'beat', 'miss▁', 'should▁', 'investors▁', 'their▁', 'to', 'ward▁', 'ct', 'x)▁', 'time▁', 'star', 'heal', 'the', 'qu', 'ity▁', 'announ', 'quarter▁', 'year▁', '—…–', 'j', 'ab', 'il▁', 'it▁', 'ear▁', 'key▁', 'sup', 'je', 'x', 'off', 'er,▁', 'but▁', 'rema', 'ins▁', 'open▁', 'per', 'foo', 'te', 'sts▁', "d's▁", "y's▁", 'equ', 'misses▁', 'medi', 'wing▁', 'strength▁', 'among▁', 'energ', 'y/', 'mat', 'eri', 'gain', 's;▁', 'um▁', 'yiel', '10▁', 'bio', 'sci', 'ence▁', 'losers▁', 'mi', 'fil', 'da▁', 'ine▁', 'can', 'big▁', 'cap▁', 'fit', 'pi', 'july▁', '6', 'impro', 'fol', 'low▁', 'bl', 'ack', 'ck▁', 'into▁', 'inve', 'january▁', 'vo', 'oun', 'announces▁', 'earn', 'fun', 'ding▁', 'd-', '–', 'f▁', 'we', 'van', 'gu', 'ard▁', 's&p▁', '…', 'etf', 'ishares▁', 'are▁', '&', 't,▁', 'national▁', 'car', 'ate', 'etf▁', 'ba', 'sed▁', 'llion▁', 'ping▁', 'seas', 'close▁', 'xed▁', 'despite▁', 'reta', 'sm', 'g▁', 'products▁', 'inc▁', 'p)▁', 'ce', 'mber▁', 'do▁', 'sal', 'es,▁', 'y:▁', 'gen', 'move▁', 'ire▁', 'ould▁', 'fir', 'fe', 'your▁', 'from▁', 'ates▁', 'down▁', '7▁', 'par', 'ann', 'dividen', 'd,▁', 'valu', 'part▁', 'ap', 'vol', 'analy', 'e,▁', 'ren', 'lead', 'american▁', 'earns▁', 'rating▁', 'update', 'ati', 'dig', 'deal▁', 'loss▁', 'than▁', 'reti', 'un', 'res', 'ves▁', 'co▁', 'ju', 'ating▁', 'brea', 'ven', 'prices▁', 'met', 'vie', 'q', 'outlook▁', 'ha', 'tic', 'upgrad', 'ng', 'sur', 'ges▁', 'as▁', 'beat▁', 'eu', 'v▁', 'asset▁', 'technologies▁', 'come▁', 'notable▁', "day's▁", 'man', 'now▁', 'lu', 'manage', 'ment', 'mit', 'contrac', 'ss', 'rate▁', 'get', 'ting▁', 'los', 'tech', 'bar', 'str', 'inde', 'ssion▁', 'zacks▁', 'highlights:▁', 'char', 'commun', 'ation', 'sou', 'ther', 'ch▁', 'ic▁', '—…', '|', 'group', 'c,▁', '.5', '$0.', 'indi', 'ces▁', 'king▁', 'me▁', '0▁', 'pen', 'grow', '(f', 'wee', 'q1▁', 'dow▁', 'bo', 'zack', 's.com▁', 'red▁', 'highligh', 'inc', '?', 'i▁', 'dup', 'de▁', 'win', 'dow', 'shor', 'ke▁', '!', 'gic▁', 'expected▁', 'what▁', 'week', 'rele', 'decl', 'month', 'distri', 'relative▁', 'trade▁', 'ears▁', 'goes▁', 'ex-dividend▁', 'book▁', 'cha', 'stor', 'tr', 'oo', 'now?▁', 'ari', 'pay', 'ou', 'mic', 'port', 'ities▁', 'bon', 'fac', 'lay', 'uni', 'servi', 'industry▁', '—', 'war', 'trading▁', '&p▁', '#', 'chin', 'ound▁', 'equity▁', 'ral', 'wall▁', 'street▁', 'break', 'rat', 'super', 'pharm', '4%▁', 'ince▁', 'last▁', 'ly', 'high-yield▁', 'll', '5.', '3%▁', 'partners▁', 'mu', '[', 'view:▁', ']', 'financi', 'ty▁', 'mist', 'yield▁', 'ay', 'winn', 'gains▁', 'august▁', 'jum', 'you▁', 'industr', 'can▁', 'pri', 'bil', 'get▁', 'inf', 'struct', 'ure▁', 'fer', 'trac', 'gener', 'hi', 'incre', 'ases▁', 'reports▁', 'operating▁', '(10-q)▁', 'ght▁', 'diam', 'rough▁', 'ban', 'bet', 't-', 'tting▁', 'before▁', "thursday's▁", 'sta', 'estim', 'verage▁', 'look▁', 'investment▁', 'loo', 'ons▁', 'wedne', "sday's▁", 'best▁', 'way▁', 'ger▁', 'all▁', 'spe', 'sector▁', 'ble', 'compan', 'atur', 'ir▁', 'erg', 'vis', 'ine', 'gold', 'pa', 'cut▁', 'net', 'ctor▁', 'resul', 'ts:▁', 'tech▁', 'x,▁', 
'ill▁', '~', 'g)▁', 'my▁', 'sum', 'man▁', 'fact', 'low', 'divi', 'tap', '8▁', 'ib', 'boo', 'debt▁', 'tg', 'gi', 'corp.▁', 't)▁', 'sli', 'ving▁', 'poin', 'ga', 'rai', 'gui', 'small▁', 'etfs▁', 'upd', 'et▁', 'ld', 'investing▁', 'har', 'sharehold', 'weekly▁', 'insi', 'highli', 'net▁', 'funds▁', 'neutral▁', 'mid', 'these▁', 'util', 'view', 'ge', 'com▁', 'sto', 'shar', 'ris', 'roun', 'ings▁', 'americ', 'lli', 'econo', '%', 'eps▁', 'right▁', 'do', 'expe', 'ff▁', 'é', 'aly', 'ong▁', 'h▁', 'michael▁', 'cy', ':', 'gre', 'hal', 'now', '+', 'qui', 'ative▁', 'ms▁', 'tn', 'uti', 'af', 'cono', 'under', 'rade', 'clo', 'end▁', 'reven', 'p.▁', 'sy', 'oper', 'streng', 'tw', 'fin', 'ree▁', 'nam', 'dri', 'val', 'corpor', 'age', 'tal', 'jan', 'ser', 'sof', 'ked▁', 'ld▁', 'resour', 'des▁', 'produ', 'der', 'oc', 'ber▁', 'rip', 'form', 'plan▁', 'track', 'miss', 'q)▁', 'nex', 'm,▁', 'neutr', 'perform', 'resu', 'yi', 'old', 'lls▁', 'technologi', '®', 'rom▁', 'indu', 'rou', 'you', 'gh▁', 'our▁', 'grad', 'blo', 'high-', 'stri', 'consum', 'rel', '"', '€'])
Number of tokens: 1031
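
The returned merges list records each learned operation as a (pattern, replacement) pair, so the merges can also be replayed, in order, on a freshly formatted word. The helper below is a small sketch of that idea (apply_merges is not part of the implementation above):

def apply_merges(word, merges):
    # Split the word into characters plus ▁, then replay every learned merge in order
    formatted = format_word(word)
    for pattern, bytepair in merges:
        formatted = re.sub(pattern, bytepair, formatted)
    return formatted.split()


print(apply_merges("investors", merges))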

Encoding and Decoding#

Decoding#

Decoding is straightforward. We simply concatenate all the tokens and replace each end-of-word marker ▁ with a space. For example, if the encoded sequence is [the▁, high, est▁, moun, tain▁], the decoded sequence is the highest mountain.
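
A minimal decoding helper might look like the following (a sketch; decode is not defined elsewhere in this lecture and simply reverses the ▁ convention used above):

def decode(tokens, space_token="▁"):
    # Concatenate the tokens and turn each end-of-word marker back into a space
    return "".join(tokens).replace(space_token, " ").strip()


print(decode(["the▁", "high", "est▁", "moun", "tain▁"]))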

Encoding#

Encoding is a bit more complex. For each word, we scan the vocabulary from the longest token to the shortest and greedily match tokens against substrings of the word; any stretch of characters that no token covers is replaced with an unknown token </u>. This process is computationally more expensive than decoding.

def measure_token_length(token, space_token="▁"):
    space_token_len = len(space_token)
    if token[-space_token_len:] == space_token:
        return len(token) - space_token_len + 1
    else:
        return len(token)


def encode_word(string, sorted_tokens, unknown_token="</u>"):
    if string == "":
        return []
    sorted_tokens = sorted_tokens.copy()
    if sorted_tokens == []:
        return [unknown_token]

    string_tokens = []
    for i in range(len(sorted_tokens)):
        token = sorted_tokens[i]
        token_reg = re.escape(token)  # re.escape already escapes "." and other metacharacters

        matched_positions = [
            (m.start(0), m.end(0)) for m in re.finditer(token_reg, string)
        ]
        if len(matched_positions) == 0:
            continue
        substring_end_positions = [
            matched_position[0] for matched_position in matched_positions
        ]

        substring_start_position = 0
        for substring_end_position in substring_end_positions:
            substring = string[substring_start_position:substring_end_position]
            string_tokens += encode_word(
                string=substring,
                sorted_tokens=sorted_tokens[i + 1 :],
                unknown_token=unknown_token,
            )
            string_tokens += [token]
            substring_start_position = substring_end_position + len(token)
        remaining_substring = string[substring_start_position:]
        string_tokens += encode_word(
            string=remaining_substring,
            sorted_tokens=sorted_tokens[i + 1 :],
            unknown_token=unknown_token,
        )
        break
    return string_tokens

We can now use this function to encode a given word.

def print_tokenization(word_given, sorted_tokens, vocab_tokenization):
    print("Tokenizing word: {}...".format(word_given))
    if word_given in vocab_tokenization:
        print("Tokenization of the known word:")
        print(vocab_tokenization[word_given])
        print("Tokenization treating the known word as unknown:")
        print(
            encode_word(
                string=word_given, sorted_tokens=sorted_tokens, unknown_token="</u>"
            )
        )
    else:
        print("Tokenizating of the unknown word:")
        print(
            encode_word(
                string=word_given, sorted_tokens=sorted_tokens, unknown_token="</u>"
            )
        )
# Sort tokens from longest to shortest (the end-of-word marker counts as a single character)
sorted_tokens = sorted(tokens.keys(), key=measure_token_length, reverse=True)

word_given_known = "investors▁"

print_tokenization(word_given_known, sorted_tokens, vocab_tokenization)

word_given_unknown = "dogecoin▁"

print_tokenization(word_given_unknown, sorted_tokens, vocab_tokenization)
Tokenizing word: investors▁...
Tokenization of the known word:
['investors▁']
Tokenization treating the known word as unknown:
['investors▁']
Tokenizing word: dogecoin▁...
Tokenization of the unknown word:
['do', 'ge', 'co', 'in▁']

Finally, we can use our encoding function to tokenize an entire sentence.

def tokenize(text, space_token="▁"):
    text = re.sub("\s+", " ", text.lower())
    words = [word + space_token for word in text.split(" ")]
    encoded_words = [
        encode_word(word, sorted_tokens, unknown_token="</u>") for word in words
    ]
    return sum(encoded_words, [])


tokenized_text = tokenize("Investment opportunities in the company")
print(tokenized_text)
['investment▁', 'op', 'port', 'un', 'ities▁', 'in▁', 'the▁', 'company▁']
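
Applying the decode helper sketched in the decoding section recovers the (lowercased) input sentence:

print(decode(tokenized_text))  # "investment opportunities in the company"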

That’s it! You have now implemented BPE from scratch. This should give you a good understanding of how subword tokenization works in practice.