Bigram

A bigram model is a predictive system that only considers two adjacent characters: given the current character, it predicts the one that follows. For example, the name "emma", padded with start/end markers, yields the bigrams (., e), (e, m), (m, m), (m, a), (a, .).

Two implementation methods

  • Basic statistics
  • One-layer neural network

Code

words = open('names.txt','r').read().splitlines()
b = {}
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs,chs[1:]):
        bigram = (ch1, ch2)
        # count how many times each adjacent pair of characters appears
        # if the bigram is not in b yet, start the count at 0
        b[bigram] = b.get(bigram, 0) + 1
sorted(b.items(), key = lambda kv: -kv[1])
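
The sorted(...) call on the last line only builds the list; to actually see the most frequent pairs, a short follow-up like this sketch (not part of the original code) can print the top few entries:

top = sorted(b.items(), key=lambda kv: -kv[1])
for bigram, count in top[:5]:
    print(bigram, count)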

Basic statistics

import torch
import string

N = torch.zeros((27,27),dtype=torch.int32)
chars = list(string.ascii_lowercase)
stoi = {s:i+1 for i, s in enumerate(chars)}
itos = {i+1: s for i, s in enumerate(chars)}

stoi['.'] = 0
itos[0] = '.'

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        # accumulate the frequency of each bigram
        N[idx1,idx2] += 1

# add-one (Laplace) smoothing to avoid zero probabilities
# convert to float so the counts can be normalized
P = (N+1).float()

# P.sum(1, keepdim = True) -> [27 X 1]
P /= P.sum(1, keepdim = True)
# P -> [27 X 27]

g = torch.Generator().manual_seed(2147483647)

for i in range(5):
    out = []
    ix = 0
    while True:
        p = P[ix]
        # sample one index from the distribution (sampling with replacement)
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))
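
Before moving on, the counting model can be scored with the same negative log-likelihood that the neural network below minimizes. This is a minimal sketch (not in the original post), reusing words, stoi, and P from the code above:

log_likelihood = 0.0
n = 0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        # probability the model assigns to the observed bigram
        prob = P[stoi[ch1], stoi[ch2]]
        log_likelihood += prob.log()
        n += 1
# average negative log-likelihood: lower means a better fit to the data
nll = -log_likelihood / n
print(nll.item())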

One-layer neural network

import torch
import torch.nn.functional as F

# xs stores the index of the first character
# ys stores the index of the second character
xs = []
ys = []


g = torch.Generator().manual_seed(2147483647)

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        xs.append(idx1)
        ys.append(idx2)

xs = torch.tensor(xs)
ys = torch.tensor(ys)

xidx = F.one_hot(xs, num_classes=27).float()

# ----------- Forward pass -----------
# W is a [27, 27] weight matrix: one row per input character,
# one logit per possible next character
W = torch.randn(27,27, generator=g, requires_grad=True)

for i in range(100):
    logits = xidx @ W
    # exponentiate so all values are positive, interpretable as counts
    counts = logits.exp()

    # normalize the counts to get probabilities (each row sums to 1)
    # keepdim=True keeps the summed dimension as size 1
    # counts.sum(1, keepdim=True) -> shape [len(xs), 1]
    probs = counts / counts.sum(1, keepdim=True)

    loss = -probs[torch.arange(len(xs)), ys].log().mean() + 0.01*(W**2).mean()

    # ----------- Back Pass -----------
    # backward pass
    W.grad = None # reset the gradient from the previous iteration
    loss.backward()

    # update
    with torch.no_grad():
        W.data += -141 * W.grad
    print(f'{i}: {loss.item()}')

# ----------- Sampling from the trained network -----------
out = []
ix = 0
while True:
    # forward pass for the current character only
    xenc = F.one_hot(torch.tensor([ix]), num_classes=27).float()
    p = (xenc @ W).exp()
    p = p / p.sum(1, keepdim=True)
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
        break
print(''.join(out))
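
As a rough sanity check (not from the original code), the trained network's probability table should end up close to the count-based P from method 1, since both are fit to the same bigram statistics. A minimal sketch, assuming W from this section and P from the previous one are still in scope (W_probs is just an illustrative name):

with torch.no_grad():
    # build the network's full 27 x 27 probability table, one row per input character
    all_inputs = F.one_hot(torch.arange(27), num_classes=27).float()
    W_probs = (all_inputs @ W).exp()
    W_probs = W_probs / W_probs.sum(1, keepdim=True)
    # largest difference from the smoothed counting table of method 1
    print((W_probs - P).abs().max())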

Note

Method 1

  • stoi maps a character to an integer index; itos is the inverse mapping
  • P.sum(1, keepdim = True) sums along each row, so after dividing, every row of P is a probability distribution that sums to 1 (see the sketch below)
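
A minimal sketch of the keepdim behaviour on a small made-up tensor (M and P_demo are illustrative names only):

import torch

M = torch.tensor([[1., 3.], [2., 2.]])
row_sums = M.sum(1, keepdim=True)   # shape [2, 1]: one sum per row
P_demo = M / row_sums               # broadcasting divides each row by its own sum
print(P_demo.sum(1))                # tensor([1., 1.]) -> every row sums to 1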

Method 2

  • 0.01*(W**2).mean() is L2 regularization (see the sketch after this list)
    • it effectively shrinks the weights toward zero
    • smoother probability distribution: keeping the weights moderate yields a more balanced distribution over the possible next characters
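
A minimal sketch of what the penalty does (W_demo and reg_strength are illustrative names, not from the original code). With the weights pushed all the way to zero, every row of the resulting distribution becomes uniform:

import torch

W_demo = torch.zeros(27, 27)
reg_strength = 0.01
reg_term = reg_strength * (W_demo ** 2).mean()   # the extra term added to the loss

probs = W_demo.exp()                              # exp(0) = 1 for every entry
probs = probs / probs.sum(1, keepdim=True)        # each row becomes uniform
print(reg_term.item(), probs[0, 0].item())        # 0.0 and 1/27 ≈ 0.037
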
This post is licensed under CC BY 4.0 by the author.