import torch
import torch.nn as nn

# Import pprint, module we use for making our print statements prettier
import pprint
pp = pprint.PrettyPrinter()

list_of_lists = [
  [1, 2, 3],
  [4, 5, 6],
]
print(list_of_lists)

[[1, 2, 3], [4, 5, 6]]

# Initializing a tensor
data = torch.tensor([
                     [0, 1],
                     [2, 3],
                     [4, 5]
                    ])
print(data)

tensor([[0, 1],
        [2, 3],
        [4, 5]])

# Initializing a tensor with an explicit data type
# Notice the dots after the numbers, which specify that they're floats
data = torch.tensor([
                     [0, 1],
                     [2, 3],
                     [4, 5]
                    ], dtype=torch.float32)
print(data)

tensor([[0., 1.],
        [2., 3.],
        [4., 5.]])

# Initializing a tensor with an explicit data type
# Notice the dots after the numbers, which specify that they're floats
data = torch.tensor([
                     [0.11111111, 1],
                     [2, 3],
                     [4, 5]
                    ], dtype=torch.float32)
print(data)

tensor([[0.1111, 1.0000],
        [2.0000, 3.0000],
        [4.0000, 5.0000]])

# Initializing a tensor without an explicit data type
# The dtype is inferred from the data: a single float entry makes every entry a float
data = torch.tensor([
                     [0.11111111, 1],
                     [2, 3],
                     [4, 5]
                    ])
print(data)

tensor([[0.1111, 1.0000],
        [2.0000, 3.0000],
        [4.0000, 5.0000]])

zeros = torch.zeros(2, 5)  # a tensor of all zeros
print(zeros)

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

ones = torch.ones(3, 4)   # a tensor of all ones
print(ones)

tensor([[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]])

rr = torch.arange(1, 10) # range from [1, 10)
print(rr)

tensor([1, 2, 3, 4, 5, 6, 7, 8, 9])

rr + 2

tensor([ 3,  4,  5,  6,  7,  8,  9, 10, 11])

rr * 2

tensor([ 2,  4,  6,  8, 10, 12, 14, 16, 18])

a = torch.tensor([[1, 2], [2, 3], [4, 5]])      # (3, 2)
b = torch.tensor([[1, 2, 3, 4], [5, 6, 7, 8]])  # (2, 4)  (3, 4)

print("A is", a)
print("B is", b)
print("The product is", a.matmul(b))
print("The other product is", a @ b) # +, -, *, @

A is tensor([[1, 2],
        [2, 3],
        [4, 5]])
B is tensor([[1, 2, 3, 4],
        [5, 6, 7, 8]])
The product is tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])
The other product is tensor([[11, 14, 17, 20],
        [17, 22, 27, 32],
        [29, 38, 47, 56]])

v = torch.tensor([1, 2, 3])

v.shape

torch.Size([3])

torch.tensor([[1, 2, 3], [4, 5, 6]]) @ v  #(2, 3) @ (3)  -> (2)

tensor([14, 32])

matr_2d = torch.tensor([[1, 2, 3], [4, 5, 6]])
print(matr_2d.shape)
print(matr_2d)

torch.Size([2, 3])
tensor([[1, 2, 3],
        [4, 5, 6]])

matr_3d = torch.tensor([[[1, 2, 3, 4], [-2, 5, 6, 9]], [[5, 6, 7, 2], [8, 9, 10, 4]], [[-3, 2, 2, 1], [4, 6, 5, 9]]])
print(matr_3d)
print(matr_3d.shape)

tensor([[[ 1,  2,  3,  4],
         [-2,  5,  6,  9]],

        [[ 5,  6,  7,  2],
         [ 8,  9, 10,  4]],

        [[-3,  2,  2,  1],
         [ 4,  6,  5,  9]]])
torch.Size([3, 2, 4])

rr = torch.arange(1, 16)
print("The shape is currently", rr.shape)
print("The contents are currently", rr)
print()
rr = rr.view(5, 3)
print("After reshaping, the shape is currently", rr.shape)
print("The contents are currently", rr)

The shape is currently torch.Size([15])
The contents are currently tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15])

After reshaping, the shape is currently torch.Size([5, 3])
The contents are currently tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])

import numpy as np

# numpy.ndarray --> torch.Tensor:
arr = np.array([[1, 0, 5]])
data = torch.tensor(arr)
print("This is a torch.tensor", data)

# torch.Tensor --> numpy.ndarray:
new_arr = data.numpy()
print("This is a np.ndarray", new_arr)

This is a torch.tensor tensor([[1, 0, 5]], dtype=torch.int32)
This is a np.ndarray [[1 0 5]]

data = torch.arange(1, 36, dtype=torch.float32).reshape(5, 7)
print("Data is:", data)

# We can perform operations like *sum* along dim 0 (down the rows), giving one value per column...
print("Taking the sum over columns:")
print(data.sum(dim=0))

# or along dim 1 (across the columns), giving one value per row.
print("Taking thep sum over rows:")
print(data.sum(dim=1))

# Other operations are available:
print("Taking the stdev over rows:")
print(data.std(dim=1))

Data is: tensor([[ 1.,  2.,  3.,  4.,  5.,  6.,  7.],
        [ 8.,  9., 10., 11., 12., 13., 14.],
        [15., 16., 17., 18., 19., 20., 21.],
        [22., 23., 24., 25., 26., 27., 28.],
        [29., 30., 31., 32., 33., 34., 35.]])
Taking the sum over columns:
tensor([ 75.,  80.,  85.,  90.,  95., 100., 105.])
Taking thep sum over rows:
tensor([ 28.,  77., 126., 175., 224.])
Taking the stdev over rows:
tensor([2.1602, 2.1602, 2.1602, 2.1602, 2.1602])

data.sum()

tensor(630.)

data = torch.tensor([[1, 2.2, 9.6], [4, -7.2, 6.3]])

row_avg = data.mean(dim=1)
col_avg = data.mean(dim=0)

print(row_avg.shape)
print(row_avg)

print(col_avg.shape)
print(col_avg)

torch.Size([2])
tensor([4.2667, 1.0333])
torch.Size([3])
tensor([ 2.5000, -2.5000,  7.9500])

# Initialize an example tensor
x = torch.Tensor([
                  [[1, 2], [3, 4]],
                  [[5, 6], [7, 8]],
                  [[9, 10], [11, 12]]
                 ])
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

x.shape

torch.Size([3, 2, 2])

# Access the 0th element along the first dimension, which is the first 2x2 matrix
x[0] # Equivalent to x[0, :]

tensor([[1., 2.],
        [3., 4.]])

x[:, 0]

tensor([[ 1.,  2.],
        [ 5.,  6.],
        [ 9., 10.]])

matr = torch.arange(1, 16).view(5, 3)
print(matr)

tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12],
        [13, 14, 15]])

matr[0]

tensor([1, 2, 3])

matr[0, :]

tensor([1, 2, 3])

matr[:, 0]

tensor([ 1,  4,  7, 10, 13])

matr[0:3]

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

matr[:, 0:2]

tensor([[ 1,  2],
        [ 4,  5],
        [ 7,  8],
        [10, 11],
        [13, 14]])

matr[0:3, 0:2]

tensor([[1, 2],
        [4, 5],
        [7, 8]])

matr[0][2]

tensor(3)

matr[0:3, 2]

tensor([3, 6, 9])

matr[0:3][2]

tensor([7, 8, 9])

matr[0:3]

tensor([[1, 2, 3],
        [4, 5, 6],
        [7, 8, 9]])

matr[[0, 2, 4]]

tensor([[ 1,  2,  3],
        [ 7,  8,  9],
        [13, 14, 15]])

# Get the top left element of each element in our tensor
x[:, 0, 0]

tensor([1., 5., 9.])

x[:, :, :]

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

# Print x again to see our tensor
x

tensor([[[ 1.,  2.],
         [ 3.,  4.]],

        [[ 5.,  6.],
         [ 7.,  8.]],

        [[ 9., 10.],
         [11., 12.]]])

# Let's access the 0th and 1st elements, each twice
i = torch.tensor([0, 0, 1, 1])
x[i]

tensor([[[1., 2.],
         [3., 4.]],

        [[1., 2.],
         [3., 4.]],

        [[5., 6.],
         [7., 8.]],

        [[5., 6.],
         [7., 8.]]])

# Let's access the 0th elements of the 1st and 2nd elements
i = torch.tensor([1, 2])
j = torch.tensor([0])
x[i, j]

tensor([[ 5.,  6.],
        [ 9., 10.]])

x[0, 0, 0]

tensor(1.)

x[0, 0, 0].item()

1.0

# Create an example tensor
# requires_grad parameter tells PyTorch to store gradients
x = torch.tensor([2.], requires_grad=True)

# Print the gradient if it is calculated
# Currently None since no backward pass has been run yet
pp.pprint(x.grad)

None

# Calculating the gradient of y with respect to x
y = x * x * 3 # 3x^2
y.backward()
pp.pprint(x.grad) # d(y)/d(x) = d(3x^2)/d(x) = 6x = 12

tensor([12.])

z = x * x * 3 # 3x^2
z.backward()
pp.pprint(x.grad)

tensor([24.])

import torch.nn as nn

# Create the inputs
input = torch.ones(2,3,4)
# N* H_in -> N*H_out


# Make a linear layer transforming (N, *, H_in)-dimensional inputs into
# (N, *, H_out)-dimensional outputs
linear = nn.Linear(4, 2)
linear_output = linear(input)
linear_output

tensor([[[-0.9886,  0.3687],
         [-0.9886,  0.3687],
         [-0.9886,  0.3687]],

        [[-0.9886,  0.3687],
         [-0.9886,  0.3687],
         [-0.9886,  0.3687]]], grad_fn=<AddBackward0>)

list(linear.parameters()) # Ax + b

[Parameter containing:
 tensor([[-0.3828,  0.1753, -0.3495, -0.0900],
         [ 0.2177,  0.3086,  0.3649, -0.4700]], requires_grad=True),
 Parameter containing:
 tensor([-0.3418, -0.0526], requires_grad=True)]

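Input data has shape [batch_size, feature_dim] (feature_dim = 4 here);
the output has shape [batch_size, output_dim] (output_dim = 2 here).

The linear layer maps feature_dim -> output_dim; its weight matrix is stored with shape (output_dim, feature_dim), as the printed parameters above show.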

linear_output

tensor([[[-0.9886,  0.3687],
         [-0.9886,  0.3687],
         [-0.9886,  0.3687]],

        [[-0.9886,  0.3687],
         [-0.9886,  0.3687],
         [-0.9886,  0.3687]]], grad_fn=<AddBackward0>)

sigmoid = nn.Sigmoid()
output = sigmoid(linear_output)
output

tensor([[[0.2712, 0.5911],
         [0.2712, 0.5911],
         [0.2712, 0.5911]],

        [[0.2712, 0.5911],
         [0.2712, 0.5911],
         [0.2712, 0.5911]]], grad_fn=<SigmoidBackward0>)

block = nn.Sequential(
    nn.Linear(4, 2),
    nn.Sigmoid()
)

input = torch.ones(2,3,4)
output = block(input)
output

tensor([[[0.6106, 0.7368],
         [0.6106, 0.7368],
         [0.6106, 0.7368]],

        [[0.6106, 0.7368],
         [0.6106, 0.7368],
         [0.6106, 0.7368]]], grad_fn=<SigmoidBackward0>)

class MultilayerPerceptron(nn.Module):
  """Small feed-forward network: Linear -> ReLU -> Linear -> Sigmoid.

  The second linear layer projects back to `input_size`, so the output has the
  same last dimension as the input, with every value passed through a sigmoid.
  """

  def __init__(self, input_size, hidden_size):
    """Build the network.

    Args:
      input_size: size of the last dimension of the input (and of the output).
      hidden_size: number of units in the hidden layer.
    """
    super().__init__()

    # Keep the constructor arguments around for later inspection
    self.input_size, self.hidden_size = input_size, hidden_size

    # The attribute name `model` is arbitrary; any sub-module assigned as an
    # attribute gets registered, whatever it is called.
    layers = [
        nn.Linear(input_size, hidden_size),
        nn.ReLU(),
        nn.Linear(hidden_size, input_size),
        nn.Sigmoid(),
    ]
    self.model = nn.Sequential(*layers)

  def forward(self, x):
    """Run `x` through the sequential stack and return the result."""
    return self.model(x)

class MultilayerPerceptron(nn.Module):
  """Same Linear -> ReLU -> Linear -> Sigmoid network, with each layer stored
  as its own attribute instead of inside one nn.Sequential.
  """

  def __init__(self, input_size, hidden_size):
    """Create the four layers.

    Args:
      input_size: size of the last dimension of the input (and of the output).
      hidden_size: number of units in the hidden layer.
    """
    super().__init__()

    # Remember the sizes the network was built with
    self.input_size, self.hidden_size = input_size, hidden_size

    # Layers, in the order they are applied in forward()
    self.linear = nn.Linear(input_size, hidden_size)
    self.relu = nn.ReLU()
    self.linear2 = nn.Linear(hidden_size, input_size)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Apply linear -> relu -> linear2 -> sigmoid to `x`."""
    hidden = self.relu(self.linear(x))
    return self.sigmoid(self.linear2(hidden))

# Make a sample input
input = torch.randn(2, 5)

# Create our model
model = MultilayerPerceptron(5, 3)

# Pass our input through our model
model(input)

tensor([[0.5962, 0.5057, 0.5963, 0.5812, 0.4767],
        [0.5955, 0.5162, 0.5749, 0.5388, 0.5024]], grad_fn=<SigmoidBackward0>)

list(model.named_parameters())

[('linear.weight', Parameter containing:
  tensor([[ 0.0368, -0.0646,  0.0542, -0.0379,  0.4065],
          [ 0.2702,  0.2839,  0.1291, -0.2588, -0.0564],
          [-0.3855,  0.2294, -0.1773,  0.3299,  0.3396]], requires_grad=True)),
 ('linear.bias', Parameter containing:
  tensor([-0.1430,  0.0027,  0.4313], requires_grad=True)),
 ('linear2.weight', Parameter containing:
  tensor([[-0.4661,  0.4033, -0.1073],
          [-0.5408, -0.4103,  0.0023],
          [-0.2667,  0.5137,  0.0963],
          [-0.3463, -0.3437,  0.5733],
          [ 0.4121, -0.0583, -0.2666]], requires_grad=True)),
 ('linear2.bias', Parameter containing:
  tensor([0.3213, 0.1311, 0.2186, 0.2109, 0.0190], requires_grad=True))]

import torch.optim as optim

# Create the y data
y = torch.ones(10, 5)

# Add some noise to our goal y to generate our x
# We want our model to predict our original data despite the noise
x = y + torch.randn_like(y)
x

tensor([[ 1.5254, -0.5026,  0.2936,  1.1698,  0.6667],
        [ 1.2823,  0.2541,  1.1641,  0.6377,  1.6982],
        [ 0.0122,  0.4521,  0.4833,  1.1832,  0.3251],
        [ 2.0469,  0.1789,  0.2834, -0.3078,  1.8006],
        [ 0.7365, -0.5649,  0.1238,  0.8654,  1.8826],
        [ 2.6088,  3.1178,  2.4707,  2.5342,  1.5063],
        [ 0.1240, -0.1938,  0.8643,  1.2870,  2.2391],
        [ 1.3622,  3.9153,  1.2991,  0.1062,  1.6210],
        [ 0.5091,  1.2953,  1.0652,  1.6663,  1.0086],
        [-0.3073, -0.0082,  1.7911,  1.0753,  0.5141]])

# Instantiate the model
model = MultilayerPerceptron(5, 3)

# Define the optimizer
adam = optim.Adam(model.parameters(), lr=1e-1)

# Define loss using a predefined loss function
loss_function = nn.BCELoss()

# Calculate how our model is doing now
y_pred = model(x)
loss_function(y_pred, y).item()

0.7129776477813721

# Set the number of epoch, which determines the number of training iterations
n_epoch = 10

for epoch in range(n_epoch):
  # Set the gradients to 0
  adam.zero_grad()

  # Get the model predictions
  y_pred = model(x)

  # Get the loss
  loss = loss_function(y_pred, y)

  # Print stats
  print(f"Epoch {epoch}: traing loss: {loss}")

  # Compute the gradients
  loss.backward()

  # Take a step to optimize the weights
  adam.step()

Epoch 0: traing loss: 0.7129776477813721
Epoch 1: traing loss: 0.5745089650154114
Epoch 2: traing loss: 0.3950759768486023
Epoch 3: traing loss: 0.2251298576593399
Epoch 4: traing loss: 0.1061786338686943
Epoch 5: traing loss: 0.04316363483667374
Epoch 6: traing loss: 0.016426226124167442
Epoch 7: traing loss: 0.006115884054452181
Epoch 8: traing loss: 0.0022449910175055265
Epoch 9: traing loss: 0.0008585943141952157

list(model.parameters())

[Parameter containing:
 tensor([[ 0.6363,  0.1566,  1.0603,  0.5428,  1.0728],
         [ 0.7450,  0.1028,  0.2315, -0.4288,  0.3089],
         [ 0.5470,  1.1053,  0.6960,  0.6960,  0.7097]], requires_grad=True),
 Parameter containing:
 tensor([1.0350, 0.8359, 0.6403], requires_grad=True),
 Parameter containing:
 tensor([[0.7256, 0.4603, 1.2903],
         [1.4280, 0.7170, 0.3321],
         [1.3276, 0.4244, 1.3750],
         [1.1533, 0.5717, 1.1169],
         [1.0995, 0.2483, 0.8655]], requires_grad=True),
 Parameter containing:
 tensor([0.3305, 0.4047, 0.9623, 0.5595, 1.1514], requires_grad=True)]

# See how our model performs on the training data
y_pred = model(x)
y_pred

tensor([[0.9989, 0.9995, 0.9999, 0.9998, 0.9995],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9969, 0.9956, 0.9997, 0.9988, 0.9983],
        [0.9999, 1.0000, 1.0000, 1.0000, 0.9999],
        [0.9992, 0.9997, 1.0000, 0.9999, 0.9997],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9999, 0.9999, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 0.9999, 1.0000, 1.0000, 1.0000],
        [0.9992, 0.9994, 1.0000, 0.9998, 0.9997]], grad_fn=<SigmoidBackward0>)

# Create test data and check how our model performs on it
x2 = y + torch.randn_like(y)
y_pred = model(x2)
y_pred

tensor([[1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [0.9979, 0.9996, 0.9999, 0.9996, 0.9992],
        [0.9948, 0.9940, 0.9993, 0.9979, 0.9964],
        [1.0000, 0.9999, 1.0000, 1.0000, 1.0000],
        [0.9982, 0.9874, 0.9998, 0.9989, 0.9983],
        [0.9980, 0.9989, 0.9998, 0.9995, 0.9989],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000, 1.0000, 1.0000]], grad_fn=<SigmoidBackward0>)

# Our raw data, which consists of sentences
corpus = [
          "We always come to Paris",
          "The professor is from Australia",
          "I live in Stanford",
          "He comes from Taiwan",
          "The capital of Turkey is Ankara"
         ]

# The preprocessing function we will use to generate our training examples
# Our function is a simple one, we lowercase the letters
# and then tokenize the words.
def preprocess_sentence(sentence):
  """Lowercase `sentence` and split it on whitespace into a list of tokens."""
  lowered = sentence.lower()
  tokens = lowered.split()
  return tokens

# Create our training set
train_sentences = [preprocess_sentence(sent) for sent in corpus]
train_sentences

[['we', 'always', 'come', 'to', 'paris'],
 ['the', 'professor', 'is', 'from', 'australia'],
 ['i', 'live', 'in', 'stanford'],
 ['he', 'comes', 'from', 'taiwan'],
 ['the', 'capital', 'of', 'turkey', 'is', 'ankara']]

# Set of locations that appear in our corpus
locations = set(["australia", "ankara", "paris", "stanford", "taiwan", "turkey"])

# Our train labels
train_labels = [[1 if word in locations else 0 for word in sent] for sent in train_sentences]
train_labels

[[0, 0, 0, 0, 1],
 [0, 0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1],
 [0, 0, 0, 1, 0, 1]]

# Find all the unique words in our corpus
vocabulary = set(w for s in train_sentences for w in s)
vocabulary

{'always',
 'ankara',
 'australia',
 'capital',
 'come',
 'comes',
 'from',
 'he',
 'i',
 'in',
 'is',
 'live',
 'of',
 'paris',
 'professor',
 'stanford',
 'taiwan',
 'the',
 'to',
 'turkey',
 'we'}

# Add the unknown token to our vocabulary
vocabulary.add("<unk>")

# Add the <pad> token to our vocabulary
vocabulary.add("<pad>")

# Function that pads the given sentence
# We are introducing this function here as an example
# We will be utilizing it later in the tutorial
def pad_window(sentence, window_size, pad_token="<pad>"):
  """Return a new list with `window_size` copies of `pad_token` on each side.

  Args:
    sentence: list of tokens; it is not modified.
    window_size: number of pad tokens added before and after the sentence.
    pad_token: the token used for padding.
  """
  padding = [pad_token for _ in range(window_size)]
  return padding + sentence + padding

# Show padding example
window_size = 2
pad_window(train_sentences[0], window_size=window_size)

['<pad>', '<pad>', 'we', 'always', 'come', 'to', 'paris', '<pad>', '<pad>']

# We are just converting our vocabulary to a list to be able to index into it
# Sorting is not necessary, we sort to show an ordered word_to_ind dictionary
# That being said, we will see that having the index for the padding token
# be 0 is convenient as some PyTorch functions use it as a default value
# such as nn.utils.rnn.pad_sequence, which we will cover in a bit
ix_to_word = sorted(list(vocabulary))

# Creating a dictionary to find the index of a given word
word_to_ix = {word: ind for ind, word in enumerate(ix_to_word)}
word_to_ix

{'<pad>': 0,
 '<unk>': 1,
 'always': 2,
 'ankara': 3,
 'australia': 4,
 'capital': 5,
 'come': 6,
 'comes': 7,
 'from': 8,
 'he': 9,
 'i': 10,
 'in': 11,
 'is': 12,
 'live': 13,
 'of': 14,
 'paris': 15,
 'professor': 16,
 'stanford': 17,
 'taiwan': 18,
 'the': 19,
 'to': 20,
 'turkey': 21,
 'we': 22}

ix_to_word[1]

'<unk>'

# Given a sentence of tokens, return the corresponding indices
def convert_token_to_indices(sentence, word_to_ix):
  """Map each token of `sentence` to its index in `word_to_ix`.

  Tokens missing from the vocabulary are mapped to the index of "<unk>".
  The "<unk>" entry is only looked up when an unknown token is met.
  """
  return [
      word_to_ix[token] if token in word_to_ix else word_to_ix["<unk>"]
      for token in sentence
  ]

# More compact version of the same function
def _convert_token_to_indices(sentence, word_to_ix):
  return [word_to_ind.get(token, word_to_ix["<unk>"]) for token in sentence]

# Show an example
example_sentence = ["we", "always", "come", "to", "kuwait"]
example_indices = convert_token_to_indices(example_sentence, word_to_ix)
restored_example = [ix_to_word[ind] for ind in example_indices]

print(f"Original sentence is: {example_sentence}")
print(f"Going from words to indices: {example_indices}")
print(f"Going from indices to words: {restored_example}")

Original sentence is: ['we', 'always', 'come', 'to', 'kuwait']
Going from words to indices: [22, 2, 6, 20, 1]
Going from indices to words: ['we', 'always', 'come', 'to', '<unk>']

# Converting our sentences to indices
example_padded_indices = [convert_token_to_indices(s, word_to_ix) for s in train_sentences]
example_padded_indices

[[22, 2, 6, 20, 15],
 [19, 16, 12, 8, 4],
 [10, 13, 11, 17],
 [9, 7, 8, 18],
 [19, 5, 14, 21, 12, 3]]

# Creating an embedding table for our words
embedding_dim = 5
embeds = nn.Embedding(len(vocabulary), embedding_dim)

# Printing the parameters in our embedding table
list(embeds.parameters())

[Parameter containing:
 tensor([[ 1.2750, -1.0534, -1.2682, -0.0481,  1.9066],
         [-0.8231, -0.8144, -0.2719, -0.2095,  0.0402],
         [-1.1930, -0.3141, -1.1884, -0.7027,  1.1982],
         [ 0.9499, -0.9259, -1.1483, -0.7451, -0.2048],
         [-0.0546,  0.3709,  0.8854,  0.9902,  0.2371],
         [-0.4857,  0.6913, -0.9795,  0.1207, -0.4599],
         [ 0.0851, -1.1664, -0.6251,  1.2031,  0.5521],
         [ 1.5833, -0.1220,  0.9138,  1.5947, -1.3666],
         [ 1.4058, -1.5917,  0.1274, -0.2661, -0.5972],
         [ 0.9353, -0.4279, -1.4017,  1.1777,  1.2239],
         [-0.3380, -0.5881,  0.5774, -0.8471,  0.6541],
         [ 0.6196,  0.8502, -0.3459, -0.0844,  0.2116],
         [ 1.6801, -0.7561,  0.5900, -1.2767,  0.4213],
         [ 0.0068,  0.9612, -2.5982, -0.5671,  2.8223],
         [-1.1635, -1.0546,  1.8576, -2.1281, -0.7757],
         [-1.4659, -0.7697, -0.0552,  1.2393, -0.6420],
         [-1.2702, -0.4198,  0.2354,  0.7255,  0.8039],
         [ 0.7282, -0.9687, -0.4723, -0.1911,  1.4960],
         [-0.9160, -0.4882,  0.0438, -2.7145,  1.1594],
         [ 1.7300, -0.2320, -1.5453, -1.3054, -0.9023],
         [-0.1510, -1.0970,  0.1899,  0.1046, -0.9781],
         [ 0.5311,  1.8488, -0.7348, -1.7153,  0.2808],
         [-0.2691,  0.2026, -0.1008,  0.3002, -0.9762]], requires_grad=True)]

# Get the embedding for the word Paris
index = word_to_ix["paris"]
index_tensor = torch.tensor(index, dtype=torch.long)
paris_embed = embeds(index_tensor)
paris_embed

tensor([-1.4659, -0.7697, -0.0552,  1.2393, -0.6420],
       grad_fn=<EmbeddingBackward0>)

# We can also get multiple embeddings at once
index_paris = word_to_ix["paris"]
index_ankara = word_to_ix["ankara"]
indices = [index_paris, index_ankara]
indices_tensor = torch.tensor(indices, dtype=torch.long)
embeddings = embeds(indices_tensor)
embeddings

tensor([[-1.4659, -0.7697, -0.0552,  1.2393, -0.6420],
        [ 0.9499, -0.9259, -1.1483, -0.7451, -0.2048]],
       grad_fn=<EmbeddingBackward0>)

from torch.utils.data import DataLoader
from functools import partial

def custom_collate_fn(batch, window_size, word_to_ix):
  """Turn a batch of (tokens, labels) pairs into padded index tensors.

  Args:
    batch: sequence of (sentence, labels) pairs, where sentence is a list of
      tokens and labels is a list of ints.
    window_size: number of "<pad>" tokens added on each side of a sentence.
    word_to_ix: dict from token to index; unknown tokens map to "<unk>".

  Returns:
    (x_padded, y_padded, lengths): the index tensor padded with the "<pad>"
    index, the label tensor padded with 0, and a LongTensor holding the number
    of labels of each example. The order matches how the training loop
    unpacks a batch.
  """
  # Split the batch into parallel tuples of token lists and label lists
  sentences, labels = zip(*batch)

  # Window-pad every sentence so that each real word has a full window
  window = ["<pad>"] * window_size
  padded_sentences = [window + sentence + window for sentence in sentences]

  # Replace tokens with vocabulary indices, falling back to "<unk>"
  index_rows = [
      [word_to_ix.get(token, word_to_ix["<unk>"]) for token in sentence]
      for sentence in padded_sentences
  ]

  # pad_sequence expects tensors; batch_first=True puts the batch dimension
  # first, and shorter rows are filled with the "<pad>" index so all examples
  # in the batch end up with the same length.
  pad_token_ix = word_to_ix["<pad>"]
  x_padded = nn.utils.rnn.pad_sequence(
      [torch.LongTensor(row) for row in index_rows],
      batch_first=True,
      padding_value=pad_token_ix,
  )

  # Record how many labels each example really has before padding them
  lengths = torch.LongTensor([len(label) for label in labels])
  y_padded = nn.utils.rnn.pad_sequence(
      [torch.LongTensor(label) for label in labels],
      batch_first=True,
      padding_value=0,
  )

  return x_padded, y_padded, lengths

def _custom_collate_fn(batch, window_size, word_to_ix):
  # Prepare the datapoints
  x, y = zip(*batch)
  x = [pad_window(s, window_size=window_size) for s in x]
  x = [convert_tokens_to_indices(s, word_to_ix) for s in x]

  # Pad x so that all the examples in the batch have the same size
  pad_token_ix = word_to_ix["<pad>"]
  x = [torch.LongTensor(x_i) for x_i in x]
  x_padded = nn.utils.rnn.pad_sequence(x, batch_first=True, padding_value=pad_token_ix)

  # Pad y and record the length
  lengths = [len(label) for label in y]
  lenghts = torch.LongTensor(lengths)
  y = [torch.LongTensor(y_i) for y_i in y]
  y_padded = nn.utils.rnn.pad_sequence(y, batch_first=True, padding_value=0)

  return x_padded, y_padded, lenghts

# Parameters to be passed to the DataLoader
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate the DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Go through one loop
counter = 0
for batched_x, batched_y, batched_lengths in loader:
  print(f"Iteration {counter}")
  print("Batched Input:")
  print(batched_x)
  print("Batched Labels:")
  print(batched_y)
  print("Batched Lengths:")
  print(batched_lengths)
  print("")
  counter += 1

Iteration 0
Batched Input:
tensor([[ 0,  0,  9,  7,  8, 18,  0,  0,  0],
        [ 0,  0, 19, 16, 12,  8,  4,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0],
        [0, 0, 0, 0, 1]])
Batched Lengths:
tensor([4, 5])

Iteration 1
Batched Input:
tensor([[ 0,  0, 22,  2,  6, 20, 15,  0,  0],
        [ 0,  0, 10, 13, 11, 17,  0,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0]])
Batched Lengths:
tensor([5, 4])

Iteration 2
Batched Input:
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])
Batched Labels:
tensor([[0, 0, 0, 1, 0, 1]])
Batched Lengths:
tensor([6])

# Print the original tensor
print(f"Original Tensor: ")
print(batched_x)
print("")

# Create the 2 * 2 + 1 chunks
chunk = batched_x.unfold(1, window_size*2 + 1, 1)
print(f"Windows: ")
print(chunk)

Original Tensor: 
tensor([[ 0,  0, 19,  5, 14, 21, 12,  3,  0,  0]])

Windows: 
tensor([[[ 0,  0, 19,  5, 14],
         [ 0, 19,  5, 14, 21],
         [19,  5, 14, 21, 12],
         [ 5, 14, 21, 12,  3],
         [14, 21, 12,  3,  0],
         [21, 12,  3,  0,  0]]])

class WordWindowClassifier(nn.Module):
  """Scores every word of a sentence from a fixed-size window around it.

  Pipeline: token indices -> sliding windows -> embeddings -> one hidden
  layer with tanh -> one output unit -> sigmoid.
  """

  def __init__(self, hyperparameters, vocab_size, pad_ix=0):
    """Build the layers.

    Args:
      hyperparameters: dict with the keys "window_size", "embed_dim",
        "hidden_dim" and "freeze_embeddings".
      vocab_size: number of rows of the embedding table.
      pad_ix: index of the padding token, passed to nn.Embedding as
        `padding_idx`.
    """
    super(WordWindowClassifier, self).__init__()

    # Instance variables
    self.window_size = hyperparameters["window_size"]
    self.embed_dim = hyperparameters["embed_dim"]
    self.hidden_dim = hyperparameters["hidden_dim"]
    self.freeze_embeddings = hyperparameters["freeze_embeddings"]

    # Embedding layer
    # Takes in a tensor of indices and returns the corresponding embeddings,
    # adding a trailing dimension of size embed_dim.
    #
    # If freeze_embeddings is True, the embedding weights are made
    # non-trainable, so only the other parameters change during training.
    self.embeds = nn.Embedding(vocab_size, self.embed_dim, padding_idx=pad_ix)
    if self.freeze_embeddings:
      # The layer is stored as `self.embeds`; freezing must target that name.
      self.embeds.weight.requires_grad = False

    # Hidden layer
    # Use this model's own window size, not a global of the same name.
    full_window_size = 2 * self.window_size + 1
    self.hidden_layer = nn.Sequential(
      nn.Linear(full_window_size * self.embed_dim, self.hidden_dim),
      nn.Tanh()
    )

    # Output layer: one score per window
    self.output_layer = nn.Linear(self.hidden_dim, 1)

    # Probabilities: squash each score with a sigmoid
    self.probabilities = nn.Sigmoid()

  def forward(self, inputs):
    """Compute one probability per word window.

    Let B  := batch_size
        L  := window-padded sentence length
        D  := self.embed_dim
        W  := 2 * self.window_size + 1 (full window width)
        L~ := L - 2 * self.window_size (number of windows per sentence)
        H  := self.hidden_dim

    Args:
      inputs: a (B, L) LongTensor of token indices.

    Returns:
      A (B, L~) FloatTensor of sigmoid outputs.
    """
    B, L = inputs.size()

    # Reshaping: (B, L) -> (B, L~, W)
    # First, get the word window centred on each word of the input.
    full_window_size = 2 * self.window_size + 1
    token_windows = inputs.unfold(1, full_window_size, 1)
    _, adjusted_length, _ = token_windows.size()

    # Good idea to do internal tensor-size sanity checks!
    assert token_windows.size() == (B, adjusted_length, full_window_size)

    # Embedding: (B, L~, W) -> (B, L~, W, D)
    embedded_windows = self.embeds(token_windows)

    # Reshaping: (B, L~, W, D) -> (B, L~, W*D)
    # The -1 argument infers the last dimension from the leftover axes.
    embedded_windows = embedded_windows.view(B, adjusted_length, -1)

    # Layer 1: (B, L~, W*D) -> (B, L~, H)
    layer_1 = self.hidden_layer(embedded_windows)

    # Layer 2: (B, L~, H) -> (B, L~, 1)
    output = self.output_layer(layer_1)

    # Sigmoid: (B, L~, 1) scores -> (B, L~, 1) probabilities, then drop the
    # trailing singleton dimension to get (B, L~).
    output = self.probabilities(output)
    output = output.view(B, -1)

    return output

# Prepare the data
data = list(zip(train_sentences, train_labels))
batch_size = 2
shuffle = True
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=window_size, word_to_ix=word_to_ix)

# Instantiate a DataLoader
loader = DataLoader(data, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)

# Initialize a model
# It is useful to put all the model hyperparameters in a dictionary
model_hyperparameters = {
    "batch_size": 4,
    "window_size": 2,
    "embed_dim": 25,
    "hidden_dim": 25,
    "freeze_embeddings": False,
}

vocab_size = len(word_to_ix)
model = WordWindowClassifier(model_hyperparameters, vocab_size)

# Define an optimizer
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define a loss function, which computes the binary cross entropy loss
def loss_function(batch_outputs, batch_labels, batch_lengths):
    """Binary cross entropy of the batch, rescaled by its total word count.

    Args:
        batch_outputs: float tensor of predicted probabilities.
        batch_labels: tensor of 0/1 labels with the same shape as the outputs;
            it is cast to float before the loss is computed.
        batch_lengths: tensor with the number of words in each example.

    Returns:
        A scalar tensor: nn.BCELoss() (default arguments) over the whole
        batch, divided by the sum of `batch_lengths`.
    """
    # Loss over every position of the batch, padded positions included
    batch_bce = nn.BCELoss()(batch_outputs, batch_labels.float())

    # Lengths hold the number of words per training example
    total_words = batch_lengths.sum().float()

    return batch_bce / total_words

# Function that will be called in every epoch
def train_epoch(loss_function, optimizer, model, loader):
  """Run one pass over `loader`, updating `model` after every batch.

  Args:
    loss_function: callable taking (outputs, batch_labels, batch_lengths) and
      returning a scalar loss tensor.
    optimizer: optimizer holding the parameters of `model`.
    model: the module being trained.
    loader: iterable yielding (batch_inputs, batch_labels, batch_lengths).

  Returns:
    The sum of the per-batch loss values over the epoch (0 if the loader
    yields nothing).
  """
  # Keep track of the total loss over the epoch
  total_loss = 0
  for batch_inputs, batch_labels, batch_lengths in loader:
    # Clear the gradients
    optimizer.zero_grad()
    # Run a forward pass. Call the module itself rather than model.forward()
    # so that any registered hooks run as well.
    outputs = model(batch_inputs)
    # Compute the batch loss
    loss = loss_function(outputs, batch_labels, batch_lengths)
    # Calculate the gradients
    loss.backward()
    # Update the parameters
    optimizer.step()
    total_loss += loss.item()

  return total_loss


# Function containing our main training loop
def train(loss_function, optimizer, model, loader, num_epochs=10000):
  """Train `model` for `num_epochs` epochs, printing the loss periodically.

  Each epoch is delegated to `train_epoch`; the epoch loss is printed for
  epoch 0 and then every 100th epoch.
  """
  report_every = 100
  for epoch in range(num_epochs):
    epoch_loss = train_epoch(loss_function, optimizer, model, loader)
    if epoch % report_every != 0:
      continue
    print(epoch_loss)

num_epochs = 1000
train(loss_function, optimizer, model, loader, num_epochs=num_epochs)

0.32406048476696014
0.2593139484524727
0.1962246038019657
0.1476144790649414
0.12322462722659111
0.09615403413772583
0.07315488904714584
0.05424202140420675
0.05166797339916229
0.041413815692067146

# Create test sentences
test_corpus = ["She comes from Paris"]
test_sentences = [s.lower().split() for s in test_corpus]
test_labels = [[0, 0, 0, 1]]

# Create a test loader
test_data = list(zip(test_sentences, test_labels))
batch_size = 1
shuffle = False
window_size = 2
collate_fn = partial(custom_collate_fn, window_size=2, word_to_ix=word_to_ix)
test_loader = torch.utils.data.DataLoader(test_data,
                                           batch_size=1,
                                           shuffle=False,
                                           collate_fn=collate_fn)

for test_instance, labels, _ in test_loader:
  outputs = model.forward(test_instance)
  print(labels)
  print(outputs)

tensor([[0, 0, 0, 1]])
tensor([[0.0805, 0.0333, 0.0686, 0.6858]], grad_fn=<ViewBackward0>)

CS224N：PyTorch 教程（22 年冬季）¶

作者：Dilara Soylu、Ethan Chi¶

简介¶

Quiz¶

Exercise:¶

Autograd¶

神经网络模块¶

Linear Layer¶

Other Module Layers¶

Activation Function Layer¶

Putting the Layers Together¶

Custom Modules¶

Optimization¶

Demo: Word Window Classification¶

Data¶

Preprocessing¶

Converting Words to Embeddings¶

Batching Sentences¶

Model¶

Training¶

Prediction¶