深度学习--全连接层、高阶应用、GPU加速
-
Cross Entropy Loss:交叉熵损失
1948年,香农将统计物理中熵的概念,引申到信道通信的过程中,从而开创了信息论这门学科,把信息中排除了冗余后的平均信息量称为“信息熵”。香农定义的“熵”又被称为香农熵或信息熵,即
标记概率空间中所有可能的样本,表示该样本的出现几率,是和单位选取相关的任意常数。
在信息学里信息量大代表着数据离散范围小,不确定性小。香农作为一个信息学家,他关心的是信息的正确传递,所以信息熵代表着信息传递的不确定性的大小。所以在信息学上,使用香农公式算出来的这个值,在信息学上叫做信息熵值,在熵权法中叫做冗余度值或者叫偏离度值,它的本来含义是指一个确定无疑的信息源发送出来的信息,受到干扰以后,衡量偏离了原始精确信息的程度。离散度越大,计算得这个值越小,则收到的信息越不可靠,得到的信息越小。这个值越大,则收到的信息越可靠,得到的信息越多。
a=torch.full([4],1/4
#tensor([0.2500, 0.2500, 0.2500, 0.2500]
#计算交叉熵
-(a*torch.log2(a.sum(
#tensor(2.
交叉熵在神经网络中作为损失函数,p表示真实标记的分布,q则为训练后的模型的预测标记分布,交叉熵损失函数可以衡量p与q的相似性。交叉熵作为损失函数还有一个好处是使用sigmoid函数在梯度下降时能避免均方误差损失函数学习速率降低的问题,因为学习速率可以被输出的误差所控制。
MNIST再实现
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
batch_size=200
learning_rate=0.01
epochs=10
#加载数据集DataLoader(数据位置,batch_size,shuffle是否打乱,num_workers=4:4线程处理)
#torchvision.datasets.MNIST(root,train,transform,download root指下载到的位置,train指是否下载训练集,transform指对图片进行转换后返回,download指是否下载
#torchvision.transforms([transforms.ToTensor(,transforms.Normalize((mean,(std]
#transforms.ToTensor(做了三件事:1.归一化/255 2.数据类型转为torch.FloatTensor 3.shape(H,W,C->(C,H,W
#transforms.Normalize((mean,(std :用均值和标准差对张量图像进行归一化
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(,
transforms.Normalize((0.1307,, (0.3081,
],
batch_size=batch_size, shuffle=True
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(,
transforms.Normalize((0.1307,, (0.3081,
],
batch_size=batch_size, shuffle=True
w1, b1 = torch.randn(200, 784, requires_grad=True,\
torch.zeros(200, requires_grad=True
w2, b2 = torch.randn(200, 200, requires_grad=True,\
torch.zeros(200, requires_grad=True
w3, b3 = torch.randn(10, 200, requires_grad=True,\
torch.zeros(10, requires_grad=True
torch.nn.init.kaiming_normal_(w1
torch.nn.init.kaiming_normal_(w2
torch.nn.init.kaiming_normal_(w3
def forward(x:
x = x@w1.t( + b1
x = F.relu(x
x = x@w2.t( + b2
x = F.relu(x
x = x@w3.t( + b3
x = F.relu(x
return x
optimizer = optim.SGD([w1, b1, w2, b2, w3, b3], lr=learning_rate
criteon = nn.CrossEntropyLoss(
for epoch in range(epochs:
for batch_idx, (data, target in enumerate(train_loader:
data = data.view(-1, 28*28
logits = forward(data
# print(data.shape, target.shape,logits.shape
loss = criteon(logits, target
optimizer.zero_grad(
loss.backward(
# print(w1.grad.norm(, w2.grad.norm(
optimizer.step(
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data, len(train_loader.dataset,
100. * batch_idx / len(train_loader, loss.item(
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28
logits = forward(data
test_loss += criteon(logits, target.item(
pred = logits.data.max(1[1]
#print(pred
correct += pred.eq(target.data.sum(
test_loss /= len(test_loader.dataset
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%\n'.format(
test_loss, correct, len(test_loader.dataset,
100. * correct / len(test_loader.dataset
全连接层
import torch
import torch.nn as nn
import torch.nn.functional as F
x=torch.randn(1,784
x.shape
#torch.Size([1, 784]
# nn.Linear(输入、输出)
layer1 = nn.Linear(784,200
layer2 = nn.Linear(200,200
layer3 = nn.Linear(200,10
x=layer1(x
x=F.relu(x,inplace=True
x.shape
#torch.Size([1, 200]
x=layer2(x
x=F.relu(x,inplace=True
x.shape
#torch.Size([1, 200]
x=layer3(x
x=F.relu(x,inplace=True
x.shape
#torch.Size([1, 10]
网络定义的高阶用法
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class MLP(nn.Module:
def __init__(self:
super(MLP,self.__init__(
self.model = nn.Sequential(
nn.Linear(784,200,
nn.ReLU(inplace=True,
nn.Linear(200,200,
nn.ReLU(inplace=True,
nn.Linear(200,10,
nn.ReLU(inplace=True,
def forward(self,x:
x=self.model(x
return x
net= MLP(
optimizer = optim.SGD(net.parameters(,lr=learning_rate
criteon = nn.CrossEntropyLoss(
其他的激活函数 SELU、softplus、
GPU加速
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
batch_size=200
learning_rate=0.01
epochs=10
#加载数据集DataLoader(数据位置,batch_size,shuffle是否打乱,num_workers=4:4线程处理)
#torchvision.datasets.MNIST(root,train,transform,download root指下载到的位置,train指是否下载训练集,transform指对图片进行转换后返回,download指是否下载
#torchvision.transforms([transforms.ToTensor(,transforms.Normalize((mean,(std]
#transforms.ToTensor(做了三件事:1.归一化/255 2.数据类型转为torch.FloatTensor 3.shape(H,W,C->(C,H,W
#transforms.Normalize((mean,(std :用均值和标准差对张量图像进行归一化
train_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=True, download=True,
transform=transforms.Compose([
transforms.ToTensor(,
transforms.Normalize((0.1307,, (0.3081,
],
batch_size=batch_size, shuffle=True
test_loader = torch.utils.data.DataLoader(
datasets.MNIST('../data', train=False, transform=transforms.Compose([
transforms.ToTensor(,
transforms.Normalize((0.1307,, (0.3081,
],
batch_size=batch_size, shuffle=True
class MLP(nn.Module:
def __init__(self:
super(MLP, self.__init__(
self.model = nn.Sequential(
nn.Linear(784, 200,
nn.LeakyReLU(inplace=True,
nn.Linear(200, 200,
nn.LeakyReLU(inplace=True,
nn.Linear(200, 10,
nn.LeakyReLU(inplace=True,
def forward(self,x:
x=self.model(x
return x
##重点重点!
device=torch.device('cuda:0'
net = MLP(.to(device
optimizer = optim.SGD(net.parameters(,lr=learning_rate
criteon = nn.CrossEntropyLoss(.to(device
for epoch in range(epochs:
for batch_idx, (data, target in enumerate(train_loader:
data = data.view(-1, 28*28
data,target = data.to(device,target.to(device
logits = net(data
# print(data.shape, target.shape,logits.shape
loss = criteon(logits, target
optimizer.zero_grad(
loss.backward(
# print(w1.grad.norm(, w2.grad.norm(
optimizer.step(
if batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data, len(train_loader.dataset,
100. * batch_idx / len(train_loader, loss.item(
test_loss = 0
correct = 0
for data, target in test_loader:
data = data.view(-1, 28 * 28
data, target = data.to(device, target.to(device
logits = net(data
test_loss += criteon(logits, target.item(
pred = logits.data.max(1[1]
#print(pred
correct += pred.eq(target.data.sum(
test_loss /= len(test_loader.dataset
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%\n'.format(
test_loss, correct, len(test_loader.dataset,
100. * correct / len(test_loader.dataset