Bigram
A bigram model is a prediction system that only considers two adjacent characters: given one character, it predicts the next one. Two implementation methods (a small example of the bigram pairs follows the list):
- Basic statistics (counting)
- A one-layer neural network
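As a quick illustration (assuming 'emma' is one of the names in names.txt, the dataset used below), the bigram pairs extracted from a single word with start/end markers look like this:

```python
word = 'emma'                       # example word; any name from names.txt works
chs = ['.'] + list(word) + ['.']    # '.' marks both the start and the end of the word
pairs = list(zip(chs, chs[1:]))     # all adjacent character pairs
print(pairs)                        # [('.', 'e'), ('e', 'm'), ('m', 'm'), ('m', 'a'), ('a', '.')]
```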
Code
Preparation
```python
words = open('names.txt', 'r').read().splitlines()
b = {}
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        bigram = (ch1, ch2)
        # count how many times each adjacent character pair appears;
        # if the bigram is not in b yet, start from 0
        b[bigram] = b.get(bigram, 0) + 1
sorted(b.items(), key=lambda kv: -kv[1])
```
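To inspect the result, the sorted counts can be sliced, for example (a minimal usage sketch; the actual counts depend on names.txt):

```python
# print the five most frequent bigrams and their counts
for (ch1, ch2), count in sorted(b.items(), key=lambda kv: -kv[1])[:5]:
    print(f'{ch1}{ch2}: {count}')
```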
Basic statistics
```python
import torch
import string

N = torch.zeros((27, 27), dtype=torch.int32)
chars = list(string.ascii_lowercase)
stoi = {s: i + 1 for i, s in enumerate(chars)}
itos = {i: s for s, i in stoi.items()}
stoi['.'] = 0
itos[0] = '.'

for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        # accumulate the frequency of each bigram
        N[idx1, idx2] += 1

# add-one smoothing to avoid zero probabilities,
# and convert to float so the rows can be normalized
P = (N + 1).float()
# P.sum(1, keepdim=True) -> [27, 1]
P /= P.sum(1, keepdim=True)
# P -> [27, 27], each row sums to 1

g = torch.Generator().manual_seed(2147483647)
for i in range(5):
    out = []
    ix = 0
    while True:
        p = P[ix]
        # sample one index from this row's distribution
        # (replacement=True: the same index may be drawn again later)
        ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
        out.append(itos[ix])
        if ix == 0:
            break
    print(''.join(out))
```
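As a sanity check (a minimal sketch, not part of the snippet above), the counting model can be scored with the average negative log-likelihood over all bigrams in the dataset; apart from the regularization term, this is the same quantity the one-layer network below minimizes as its loss:

```python
# average negative log-likelihood of the data under P (lower is better)
log_likelihood = 0.0
n = 0
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        prob = P[stoi[ch1], stoi[ch2]]
        log_likelihood += torch.log(prob)
        n += 1
nll = -log_likelihood / n
print(f'average NLL: {nll.item():.4f}')
```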
One-layer Neural Networks
```python
import torch
import torch.nn.functional as F

# xs stores the index of the first character of each bigram,
# ys the index of the second (the training target)
xs = []
ys = []
g = torch.Generator().manual_seed(2147483647)
for w in words:
    chs = ['.'] + list(w) + ['.']
    for ch1, ch2 in zip(chs, chs[1:]):
        idx1 = stoi[ch1]
        idx2 = stoi[ch2]
        xs.append(idx1)
        ys.append(idx2)
xs = torch.tensor(xs)
ys = torch.tensor(ys)
# one-hot encode the inputs
xenc = F.one_hot(xs, num_classes=27).float()

# W is [27, 27]: one row of output logits for every possible input character
W = torch.randn((27, 27), generator=g, requires_grad=True)

for i in range(100):
    # ----------- Forward pass -----------
    logits = xenc @ W
    # exponentiate so all the values are positive ("counts")
    counts = logits.exp()
    # normalize the counts to get probabilities (each row sums to 1)
    # keepdim=True keeps the summed dimension as size 1
    # counts.sum(1, keepdim=True) -> [len(xs), 1]
    probs = counts / counts.sum(1, keepdim=True)
    loss = -probs[torch.arange(len(xs)), ys].log().mean() + 0.01 * (W**2).mean()

    # ----------- Backward pass -----------
    W.grad = None  # zero the gradient
    loss.backward()

    # update
    with torch.no_grad():
        W.data += -141 * W.grad
    print(f'{i}: {loss.item()}')

# sample one name from the trained network
out = []
ix = 0
while True:
    # forward the current character through the network to get its distribution
    xenc_ix = F.one_hot(torch.tensor([ix]), num_classes=27).float()
    logits = xenc_ix @ W
    counts = logits.exp()
    p = counts / counts.sum(1, keepdim=True)
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
        break
print(''.join(out))
```
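One way to see the connection to method 1 (a small sketch, not from the original code): multiplying a one-hot vector by W simply selects one row of W, so exponentiating and normalizing that row plays the same role as a row of the count-based matrix P.

```python
# one_hot(ix) @ W just picks out row ix of W
ix = 5                                              # arbitrary example index
xenc_ix = F.one_hot(torch.tensor([ix]), num_classes=27).float()
row = xenc_ix @ W                                   # shape [1, 27]
print(torch.allclose(row[0], W[ix]))                # True: identical to W[ix]
```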
Note
Method 1
- `stoi` maps a character to an int; `itos` is the inverse mapping (int back to character).
- `P.sum(1, keepdim=True)`: each row of P is one sample (one conditioning character), and the sum is taken across each row, so after dividing, every row of P sums to 1.
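A tiny sketch of why keepdim=True matters here (illustrative values, not from the dataset): with keepdim the row sums broadcast against the rows, without it a square matrix silently gets divided along the wrong axis.

```python
import torch
M = torch.tensor([[1., 3.], [2., 6.]])
s_keep = M.sum(1, keepdim=True)   # shape [2, 1]: one sum per row
s_drop = M.sum(1)                 # shape [2]: broadcasts as a row instead
print(M / s_keep)   # each row divided by its own sum -> rows sum to 1
print(M / s_drop)   # each COLUMN divided by s_drop[j] -> wrong normalization
```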
Method 2
- `0.01 * (W**2).mean()` is L2 regularization: it effectively shrinks the weights toward zero.
- Smoother probability distribution: by keeping the weights moderate, the model gives a more balanced probability distribution across the possible next characters.
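To make the regularization term concrete (an illustrative sketch with hypothetical weight scales, not from the post): the penalty grows with the magnitude of the weights, which is what pushes them toward zero during training.

```python
import torch
W_small = 0.1 * torch.randn(27, 27)   # hypothetical small weights
W_large = 10.0 * torch.randn(27, 27)  # hypothetical large weights
print(0.01 * (W_small**2).mean())     # tiny penalty
print(0.01 * (W_large**2).mean())     # much larger penalty
```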
This post is licensed under CC BY 4.0 by the author.