CNN基础 & CV基本应用

卷积的概念不多叙述。

其中卷积维度相关的notation见下图:LS5XNSP.png

LeNet

LeNet是用于对28*28的灰度图片分类的网络。其架构简化为下图:OQ69QN.png

对这一架构复现的代码如下:(隐去了引入数据集这一步)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import torch
from torch import nn
from torchvision.transforms import ToPILImage

# LeNet-style classifier for 1x28x28 grayscale inputs, written as a flat
# Sequential: two conv/sigmoid/avg-pool stages followed by three Linear layers.
net = nn.Sequential(
    nn.Conv2d(1, 6, kernel_size=5, padding=2), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(6, 16, kernel_size=5), nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    nn.Linear(16 * 5 * 5, 120), nn.Sigmoid(),
    nn.Linear(120, 84), nn.Sigmoid(),
    nn.Linear(84, 10))

# Sanity check: push one random 1x1x28x28 sample through the network one layer
# at a time and print every layer together with the shape it produces.
X = torch.randn([1, 1, 28, 28], dtype=torch.float32)
for layer in net:
    X = layer(X)
    print(layer, X.shape)

def train(dataLoader, testLoader, trainModel):
    """Train ``trainModel`` for 10 epochs and print per-epoch metrics.

    Args:
        dataLoader: iterable of ``(inputs, labels)`` training batches; its
            ``.dataset`` length is used as the training-accuracy denominator.
        testLoader: iterable of ``(inputs, labels)`` test batches, evaluated
            once after every epoch; ``.dataset`` length is the denominator.
        trainModel: the network to train (the ``net`` defined above is meant
            to be passed here). Its Linear/Conv2d weights are re-initialised
            in place before training starts.

    Returns:
        None. After each epoch prints the summed batch loss, the training
        accuracy and the test accuracy; prints ``finish!`` at the end.
    """
    net = trainModel
    optimizer = torch.optim.SGD(net.parameters(), lr=0.9)  # plain SGD
    criterion = nn.CrossEntropyLoss()  # cross-entropy on raw logits

    def init_weights(m):
        # Xavier-uniform init for every Linear / Conv2d weight (the author
        # notes this mattered a lot for training speed).
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            nn.init.xavier_uniform_(m.weight)
    net.apply(init_weights)

    wasTraining = net.training  # restored on exit so callers see no mode change
    for epoch in range(10):  # one epoch = one full pass over dataLoader
        runningLoss = 0.0  # sum of the batch losses of this epoch
        correct = 0  # correctly classified training samples
        testCorrect = 0  # correctly classified test samples

        net.train()
        for inputs, labels in dataLoader:
            optimizer.zero_grad()  # clear gradients left from the last step
            outputs = net(inputs)

            pred = outputs.argmax(dim=1)
            correct += (pred == labels).sum().item()

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            runningLoss += loss.item()

        # Evaluate on the test loader: eval mode and no autograd bookkeeping.
        net.eval()
        with torch.no_grad():
            for testData, testLabels in testLoader:
                pred = net(testData).argmax(dim=1)
                testCorrect += (pred == testLabels).sum().item()

        print(f'epoch{epoch}:', runningLoss, f'trainning accuracy:{correct/len(dataLoader.dataset)}', f'test acc:{testCorrect/len(testLoader.dataset)}')
    net.train(wasTraining)
    print('finish!')

AlexNet进行迁移学习

其架构见下图。论文中的结构图画得不太清楚,与实际实现也有部分出入。4QZ0LE4F.png


以AlexNet为例,实现迁移学习:
代码块1:导入torch的alexnet:

1
2
3
import torch
# Load AlexNet with pretrained weights from the torchvision hub repository
# (pinned to the v0.10.0 tag).
model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'alexnet',
    pretrained=True,
)
# Put the model into evaluation mode.
model.eval()

代码块2:载入数据集(狗种类数据集)和数据集处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from PIL import Image
import torch.nn as nn
import torch.optim as optim
from os import listdir
import os
from torchvision import transforms
import pandas as pd
from sklearn.model_selection import train_test_split

class DogDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads an image from disk and pairs it with a label."""

    def __init__(self, img_paths, img_labels, size_of_images):
        """Store the sample description.

        Args:
            img_paths: sequence of image file paths.
            img_labels: sequence of labels, index-aligned with ``img_paths``.
            size_of_images: size passed to ``PIL.Image.resize`` (the script
                below uses ``(224, 224)``, the AlexNet input size).
        """
        self.img_paths = img_paths
        self.img_labels = img_labels
        self.size_of_images = size_of_images

    def __len__(self):
        """Return the number of samples."""
        return len(self.img_paths)

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the sample at ``index``."""
        # The context manager releases the file handle once the pixels are read.
        with Image.open(self.img_paths[index]) as raw_image:
            # Force three channels so greyscale / RGBA files still yield tensors
            # of the same shape as the rest of the batch.
            PIL_IMAGE = raw_image.convert('RGB').resize(self.size_of_images)
        TENSOR_IMAGE = transforms.ToTensor()(PIL_IMAGE)  # PIL image -> tensor
        label = self.img_labels[index]
        return TENSOR_IMAGE, label

train_paths = []  # path of every image found in the training folder
test_paths = []
labels = []  # integer class label per image, index-aligned with train_paths


# To normalise with an explicit mean/std as well, use this transform in place
# of the bare transforms.ToTensor():
# normalize = transforms.Normalize(
#     mean=[0.485, 0.456, 0.406],
#     std=[0.229, 0.224, 0.225]
# )
# transform = transforms.Compose([transforms.ToTensor(), normalize])

train_paths_lir = r'D:\deepLearning\dog-breed-identification\train'  # image folder
labels_data = pd.read_csv(r'D:\deepLearning\dog-breed-identification\labels.csv')  # already a DataFrame

# Number the breed names 0..K-1 (the data set has 120 breeds), in the order
# value_counts() lists them.
size_mapping = {
    breed: index
    for index, breed in enumerate(labels_data['breed'].value_counts().index)
}

# Look every image's breed up by file name instead of trusting that the
# directory listing happens to be in the same order as the CSV rows.
# NOTE(review): assumes labels.csv has an `id` column holding each image's
# file name without extension — confirm against the data set.
breed_by_id = dict(zip(labels_data['id'], labels_data['breed']))
for path in sorted(listdir(train_paths_lir)):
    train_paths.append(os.path.join(train_paths_lir, path))
    labels.append(size_mapping[breed_by_id[os.path.splitext(path)[0]]])

# Hold out 20% of the labelled images as the test split.
X_train, X_test, y_train, y_test = train_test_split(train_paths, labels, test_size=0.2)
train_set = DogDataset(X_train, y_train, (224, 224))
test_set = DogDataset(X_test, y_test, (224, 224))
# Batch both splits, 64 images per batch.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=64)

代码块3:指定训练过程、冻结参数和新建自己的分类层

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Replace the last classifier layer of AlexNet with a 120-way Linear layer,
# one output per dog breed.
model.classifier[6] = nn.Linear(in_features=4096, out_features=120)
# Freeze every weight of the model ...
for param in model.parameters():
    param.requires_grad = False
# ... then unfreeze only the new final layer so it is the only part trained.
for param in model.classifier[6].parameters():
    param.requires_grad = True

def train(dataLoader, trainModel):
    """Fine-tune ``trainModel`` for 10 epochs with Adam and print per-epoch metrics.

    Args:
        dataLoader: iterable of ``(inputs, labels)`` batches; its ``.dataset``
            length is used as the accuracy denominator.
        trainModel: the network to train. Only parameters with
            ``requires_grad=True`` are handed to the optimizer.

    Returns:
        None. Prints the batch index after every step, the summed loss and
        training accuracy after every epoch, and ``finish!`` at the end.
    """
    net = trainModel
    # Frozen parameters never receive gradients, so leave them out.
    trainable = [p for p in net.parameters() if p.requires_grad]
    optimizer = optim.Adam(trainable, lr=0.001)
    # CrossEntropyLoss applies log-softmax itself, so it must be fed the raw
    # logits; a Softmax in front of it would flatten the loss signal.
    criterion = nn.CrossEntropyLoss()
    sampleCount = len(dataLoader.dataset)  # actual size of the split being trained on
    for epoch in range(10):  # one epoch = one full pass over dataLoader
        runningLoss = 0.0  # sum of the batch losses of this epoch
        correct = 0  # correctly classified samples of this epoch
        for i, data in enumerate(dataLoader, 0):
            inputs, labels = data  # same order as the dataset's __getitem__
            optimizer.zero_grad()  # clear gradients left from the last step
            outputs = net(inputs)

            # argmax over logits equals argmax over softmax probabilities
            pred = outputs.argmax(dim=1)
            correct += (pred == labels).sum().item()

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            runningLoss += loss.item()
            print(i)

        print(f'epoch{epoch}:', runningLoss, f'trainning accuracy:{correct/sampleCount}')
    print('finish!')

AlexNet复现

直接上代码,kaiming初始化真的猛。与LeNet相比,这里的主要变化是使用更小的学习速率训练,这是因为网络更深更广、图像分辨率更高,训练卷积神经网络就更昂贵。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
from torch import nn
import torchvision

# AlexNet-style network for single-channel input with a 10-way output.
net = nn.Sequential(
    # convolutional stages
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    # fully connected classifier with dropout after each hidden layer
    nn.Linear(in_features=9216, out_features=4096),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(in_features=4096, out_features=4096),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(in_features=4096, out_features=10),
)

# Obtain FashionMNIST through torchvision's own dataset class, stored under
# ./fashionMNIST (download=True asks torchvision to fetch it).
dataset = torchvision.datasets.FashionMNIST(
    './fashionMNIST',
    download=True,
)
# Conversion pipeline that turns an image into a tensor.
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor()]
)

class newDataset(torch.utils.data.Dataset):
    """Wrap raw image data and labels, resizing each image to 227x227 on access."""

    def __init__(self, data, label):
        """Keep the image container ``data`` and the index-aligned ``label`` container."""
        self.data = data
        self.label = label

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(image_tensor, label)`` for the sample at ``index``."""
        # raw sample -> PIL image -> resize to 227x227 -> tensor via the
        # module-level ``transform``. ToPILImage is referenced through
        # torchvision so this snippet's own imports are sufficient.
        pil_image = torchvision.transforms.ToPILImage()(self.data[index])
        img = transform(pil_image.resize((227, 227)))
        return img, self.label[index]
# Only the first 5000 training images are used; the full set is too much data
# for this experiment.
trainLoader = torch.utils.data.DataLoader(
    newDataset(dataset.data[:5000], dataset.targets[:5000]), batch_size=128)
# `dataset` above holds the training split, so its `test_data` attribute would
# hand back training images again. Load the real test split instead so the
# reported test accuracy is measured on unseen samples.
testDataset = torchvision.datasets.FashionMNIST('./fashionMNIST', train=False, download=True)
testLoader = torch.utils.data.DataLoader(
    newDataset(testDataset.data[:100], testDataset.targets[:100]), batch_size=128)

def train(dataLoader, testLoader, trainModel):
    """Train ``trainModel`` for 10 epochs with SGD and print per-epoch metrics.

    Args:
        dataLoader: iterable of ``(inputs, labels)`` training batches; its
            ``.dataset`` length is used as the training-accuracy denominator.
        testLoader: iterable of ``(inputs, labels)`` test batches, evaluated
            once after every epoch; ``.dataset`` length is the denominator.
        trainModel: the network to train. Its Linear/Conv2d weights are
            re-initialised in place before training starts.

    Returns:
        None. Prints ``now batch <i>`` after every step, the summed loss plus
        training and test accuracy after every epoch, and ``finish!`` at the end.
    """
    net = trainModel
    optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
    criterion = nn.CrossEntropyLoss()  # cross-entropy on raw logits

    def init_weights(m):
        # Kaiming-uniform init for every Linear / Conv2d weight (the author
        # notes this mattered a lot for training speed).
        if isinstance(m, (nn.Linear, nn.Conv2d)):
            nn.init.kaiming_uniform_(m.weight)
    net.apply(init_weights)

    wasTraining = net.training  # restored on exit so callers see no mode change
    for epoch in range(10):  # one epoch = one full pass over dataLoader
        runningLoss = 0.0  # sum of the batch losses of this epoch
        correct = 0  # correctly classified training samples
        testCorrect = 0  # correctly classified test samples

        net.train()  # Dropout active while fitting
        for i, data in enumerate(dataLoader, 0):
            inputs, labels = data  # same order as the dataset's __getitem__
            optimizer.zero_grad()  # clear gradients left from the last step
            outputs = net(inputs)

            pred = outputs.argmax(dim=1)
            correct += (pred == labels).sum().item()

            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            runningLoss += loss.item()
            print(f'now batch {i}')

        # Evaluate with Dropout switched off and without building autograd
        # graphs; otherwise test accuracy is measured on a randomly thinned net.
        net.eval()
        with torch.no_grad():
            for testData, testLabels in testLoader:
                pred = net(testData).argmax(dim=1)
                testCorrect += (pred == testLabels).sum().item()

        print(f'epoch{epoch}:', runningLoss, f'trainning accuracy:{correct/len(dataLoader.dataset)}', f'test acc:{testCorrect/len(testLoader.dataset)}')
    net.train(wasTraining)
    print('finish!')

VGG

VGG将conv-activate-pooling的模式封装成块,块的超参数为卷积层个数convNum。VGG论文还提到,更深的网络配合更小的卷积核(即 3×3 )更有效。
这里的train和dataset代码和上一个相同,只给出网络架构的实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
def vggBlock(convNum, inChannel, outChannel):
    """Build one VGG block: ``convNum`` x (3x3 conv + ReLU), then a 2x2 max-pool.

    Args:
        convNum: number of conv/ReLU pairs in the block.
        inChannel: channel count of the block's input.
        outChannel: channel count produced by every conv in the block.

    Returns:
        An ``nn.Sequential`` holding the layers in order.
    """
    blockStruc = []
    for _ in range(convNum):
        blockStruc.append(nn.Conv2d(in_channels=inChannel, out_channels=outChannel, padding=1, kernel_size=3))
        blockStruc.append(nn.ReLU())
        # every conv after the first one sees outChannel input channels
        inChannel = outChannel
    blockStruc.append(nn.MaxPool2d(kernel_size=2, stride=2))
    return nn.Sequential(*blockStruc)  # the star unpacks the list into arguments

# One (number of convs, output channels) pair per VGG block. The author notes
# the channel counts were divided by 4 to cut the amount of computation.
blockSet = (
    (1, 16),
    (1, 32),
    (2, 64),
    (2, 128),
    (2, 128),
)

def vggnet(blockSet):
    """Assemble a VGG-style network for single-channel input with 10 outputs.

    Args:
        blockSet: iterable of ``(convNum, outChannel)`` pairs, one per VGG
            block, applied in order.

    Returns:
        An ``nn.Sequential`` of the VGG blocks followed by Flatten and a
        three-layer fully connected classifier with dropout.
    """
    netStruc = []
    inChannel = 1  # the network takes single-channel images
    for convNum, outChannel in blockSet:
        netStruc.append(vggBlock(convNum, inChannel, outChannel))
        inChannel = outChannel

    # inChannel now holds the channel count of the last block. Using it rather
    # than the loop variable keeps the function defined for an empty blockSet.
    # NOTE(review): the 7*7 factor assumes the feature map reaching Flatten is
    # 7x7 — confirm for the input size and number of blocks actually used.
    return nn.Sequential(
        *netStruc,
        nn.Flatten(),
        nn.Linear(in_features=inChannel * 7 * 7, out_features=4096),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(4096, 4096),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(4096, 10),
    )

# Instantiate the VGG network from the block configuration above.
net = vggnet(blockSet=blockSet)

NiN

NiN论文的观点是:如果在卷积层之后用全连接层进行学习,有可能完全丢失表征的空间结构。NiN使用1*1的卷积充当逐像素的全连接层。它的块类似VGG的块,不过VGG的块是conv-activate-pooling,而NiN的块是Conv-1*1Conv-1*1Conv-Pool(每个Conv后都有激活,这里不写出)。1*1卷积有时称为bottleneck层,可以减少计算量。
NiN中最后有一个AdaptiveAvgPool2d,比如说一个5channel的28 * 28图像,经过这个之后就是5 * 1 * 1。这免去了全连接层的大量参数,而全连接层的大量参数有可能减慢计算速度和造成过拟合。不过AdaptiveAvgPool2d有可能导致收敛速度减慢。
NiN的设计事实上很多部分来自AlexNet。结构如下:7O7XSPKD.png

很遗憾,在小批量的数据上NiN的收敛速度的确很慢,我在取5000张fashion_mnist数据集的训练中收敛极慢,没有复现成功。其网络架构为:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
def ninBlock(inChannel, outChannel, kernel_size, padding, stride):
    """Build one NiN block: a spatial conv followed by two 1x1 convs, each with ReLU.

    Args:
        inChannel: channel count of the block's input.
        outChannel: channel count produced by all three convs.
        kernel_size: kernel size of the first (spatial) conv.
        padding: padding of the first conv.
        stride: stride of the first conv.

    Returns:
        An ``nn.Sequential`` with the six layers in order.
    """
    return nn.Sequential(
        nn.Conv2d(in_channels=inChannel, out_channels=outChannel, kernel_size=kernel_size, padding=padding, stride=stride),
        nn.ReLU(),
        nn.Conv2d(in_channels=outChannel, out_channels=outChannel, kernel_size=1),  # 1x1 conv
        nn.ReLU(),
        nn.Conv2d(in_channels=outChannel, out_channels=outChannel, kernel_size=1),  # 1x1 conv
        nn.ReLU(),
    )

# NiN network: three NiN blocks, each followed by a 3x3 stride-2 max-pool, then
# dropout and a final NiN block with 10 output channels (one per class).
net = nn.Sequential(
    ninBlock(inChannel=1, outChannel=96, kernel_size=11, padding=0, stride=4),
    nn.MaxPool2d(kernel_size=3, stride=2),
    ninBlock(inChannel=96, outChannel=256, kernel_size=5, padding=2, stride=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    ninBlock(inChannel=256, outChannel=384, kernel_size=3, padding=1, stride=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(0.5),
    ninBlock(inChannel=384, outChannel=10, kernel_size=3, padding=1, stride=1),
    # Key step: average each channel down to 1x1, so no fully connected layer
    # is needed after it.
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
)

GoogleNet

GoogLeNet论文指出,同时使用不同大小的卷积核有时是有利的。GoogLeNet对同一层的输入并行做多路conv,不同大小的卷积核类似不同大小的滤波器,可以提取到不同的信息,最后把各路结果在通道维度上拼起来作为输出。一个模块如下:4OPWR07.png
而整个网络架构如下:XFJ4M0XIEVF.png
这里给出简化过的GoogleNet实现。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
class Inception(nn.Module):
    """Inception block: four parallel paths whose outputs are concatenated on the channel axis."""

    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs):
        """Create the layers of the four paths.

        Args:
            in_channels: channel count of the input.
            c1: output channels of path 1 (single 1x1 conv).
            c2: pair ``(reduce, out)`` for path 2 (1x1 conv, then 3x3 conv).
            c3: pair ``(reduce, out)`` for path 3 (1x1 conv, then 5x5 conv).
            c4: output channels of path 4 (3x3 max-pool, then 1x1 conv).
            **kwargs: forwarded to ``nn.Module.__init__``.
        """
        super(Inception, self).__init__(**kwargs)
        # path 1: a single 1x1 conv
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)
        # path 2: 1x1 conv followed by a 3x3 conv
        self.p2_1 = nn.Conv2d(in_channels, c2[0], kernel_size=1)
        self.p2_2 = nn.Conv2d(c2[0], c2[1], kernel_size=3, padding=1)
        # path 3: 1x1 conv followed by a 5x5 conv
        self.p3_1 = nn.Conv2d(in_channels, c3[0], kernel_size=1)
        self.p3_2 = nn.Conv2d(c3[0], c3[1], kernel_size=5, padding=2)
        # path 4: 3x3 max-pool followed by a 1x1 conv
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        """Run all four paths on ``x`` and concatenate their outputs along dim 1."""
        # torch.relu is used because no functional alias ``F`` is imported here.
        p1 = torch.relu(self.p1_1(x))
        p2 = torch.relu(self.p2_2(torch.relu(self.p2_1(x))))
        p3 = torch.relu(self.p3_2(torch.relu(self.p3_1(x))))
        p4 = torch.relu(self.p4_2(self.p4_1(x)))
        # join the path outputs on the channel dimension
        return torch.cat((p1, p2, p3, p4), dim=1)

# Stage 1: 7x7 stride-2 conv, then 3x3 stride-2 max-pool.
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# Stage 2: 1x1 conv, 3x3 conv, then max-pool.
b2 = nn.Sequential(
    nn.Conv2d(64, 64, kernel_size=1),
    nn.ReLU(),
    nn.Conv2d(64, 192, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# Stage 3: two Inception blocks, then max-pool.
b3 = nn.Sequential(
    *(Inception(*cfg) for cfg in (
        (192, 64, (96, 128), (16, 32), 32),
        (256, 128, (128, 192), (32, 96), 64),
    )),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# Stage 4: five Inception blocks, then max-pool.
b4 = nn.Sequential(
    *(Inception(*cfg) for cfg in (
        (480, 192, (96, 208), (16, 48), 64),
        (512, 160, (112, 224), (24, 64), 64),
        (512, 128, (128, 256), (24, 64), 64),
        (512, 112, (144, 288), (32, 64), 64),
        (528, 256, (160, 320), (32, 128), 128),
    )),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# Stage 5: two Inception blocks, global average pooling to 1x1, then flatten.
b5 = nn.Sequential(
    *(Inception(*cfg) for cfg in (
        (832, 256, (160, 320), (32, 128), 128),
        (832, 384, (192, 384), (48, 128), 128),
    )),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
)

# Full network: the five stages followed by a 10-way linear classifier.
net = nn.Sequential(b1, b2, b3, b4, b5, nn.Linear(1024, 10))

ResNet

ResNet直觉上把要拟合的f(x)变换成f(x)-x,这在一定程度上更好实现,如下:N.png
在ResNet块中,有时希望conv改变通道数或特征图大小(例如stride不为1),这时输入X与conv的输出维度不匹配,不能直接相加。所以引入另一种块,即用1*1conv变换X的维度,使其可加。5B8CL30MVUX.png
那么ResNet的总架构由这些块构成,如下:Z4.png

实现代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
class Residual(nn.Module):
    """Residual unit: two 3x3 conv + batch-norm layers plus a skip connection."""

    def __init__(self, input_channels, num_channels,
                 use_1x1conv=False, strides=1):
        """Create the layers.

        Args:
            input_channels: channel count of the input.
            num_channels: channel count of the output.
            use_1x1conv: when True, the skip path goes through a 1x1 conv so
                its shape matches the main path.
            strides: stride of the first conv (and of the 1x1 skip conv).
        """
        super().__init__()
        # first conv: changes the channel count and, through the stride, the size
        self.conv1 = nn.Conv2d(input_channels, num_channels,
                               kernel_size=3, padding=1, stride=strides)
        # second conv: keeps channels and spatial size
        self.conv2 = nn.Conv2d(num_channels, num_channels,
                               kernel_size=3, padding=1)
        if use_1x1conv:
            # 1x1 conv on the skip path so X can be added to Y
            self.conv3 = nn.Conv2d(input_channels, num_channels,
                                   kernel_size=1, stride=strides)
        else:
            self.conv3 = None
        self.bn1 = nn.BatchNorm2d(num_channels)
        self.bn2 = nn.BatchNorm2d(num_channels)

    def forward(self, X):
        """Return ``relu(main_path(X) + skip(X))``."""
        # torch.relu is used because no functional alias ``F`` is imported here.
        Y = torch.relu(self.bn1(self.conv1(X)))
        Y = self.bn2(self.conv2(Y))
        if self.conv3 is not None:
            X = self.conv3(X)
        Y += X
        return torch.relu(Y)

def resnet_block(input_channels, num_channels, num_residuals,
                 first_block=False):
    """Build the list of ``Residual`` units that make up one ResNet stage.

    Args:
        input_channels: channel count entering the stage.
        num_channels: channel count produced by every unit of the stage.
        num_residuals: number of ``Residual`` units in the stage.
        first_block: True for the first stage, whose first unit keeps the
            spatial size; every other stage halves it in its first unit.

    Returns:
        A list of ``Residual`` modules, ready to be unpacked into ``nn.Sequential``.
    """
    blk = []
    for i in range(num_residuals):
        if i == 0 and not first_block:
            # first unit of a later stage: stride 2 and a 1x1 conv on the skip path
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=True, strides=2))
        elif i == 0:
            # first unit of the first stage: no downsampling. It still has to
            # accept input_channels, so project the skip path when they differ.
            blk.append(Residual(input_channels, num_channels,
                                use_1x1conv=input_channels != num_channels))
        else:
            blk.append(Residual(num_channels, num_channels))
    return blk

# Stem: 7x7 stride-2 conv, batch-norm, ReLU, then 3x3 stride-2 max-pool.
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)
# Four residual stages with two units each.
b2 = nn.Sequential(*resnet_block(input_channels=64, num_channels=64, num_residuals=2, first_block=True))
b3 = nn.Sequential(*resnet_block(input_channels=64, num_channels=128, num_residuals=2))
b4 = nn.Sequential(*resnet_block(input_channels=128, num_channels=256, num_residuals=2))
b5 = nn.Sequential(*resnet_block(input_channels=256, num_channels=512, num_residuals=2))
# Full network: stem, stages, global average pooling, flatten, 10-way classifier.
net = nn.Sequential(
    b1, b2, b3, b4, b5,
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(512, 10),
)

DenseNet

DenseNet在ResNet的基础上对每个块之间都进行了连接。具体表现为,当经过一个DenseNet块之后,处理的输出会append到输入矩阵中,进行下一个处理。但这样会带来问题,就是输出越来越大。为此,使用过渡层,即1*1conv后接一个AvgPool,对输出规模减小。

此外,DenseNet使用了更新的块顺序。即BN-activate-conv。代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def conv_block(input_channels, num_channels):
    """Return the basic BN -> ReLU -> 3x3 conv unit used inside ``DenseBlock``.

    Args:
        input_channels: channel count of the input.
        num_channels: channel count produced by the conv.
    """
    return nn.Sequential(
        nn.BatchNorm2d(input_channels),
        nn.ReLU(),
        # 3x3 conv with padding 1 keeps the spatial size
        nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1))

class DenseBlock(nn.Module):
    """Dense block: each conv unit receives the input plus all earlier units' outputs."""

    def __init__(self, num_convs, input_channels, num_channels):
        """Create ``num_convs`` conv units.

        Args:
            num_convs: number of conv units in the block.
            input_channels: channel count of the block's input.
            num_channels: channels added by each conv unit.
        """
        super(DenseBlock, self).__init__()
        layer = []
        for i in range(num_convs):
            # unit i sees the original input plus the i outputs concatenated
            # before it (see forward)
            layer.append(conv_block(
                num_channels * i + input_channels, num_channels))
        self.net = nn.Sequential(*layer)

    def forward(self, X):
        """Return ``X`` with every unit's output appended on the channel axis."""
        for blk in self.net:
            Y = blk(X)
            # concatenate input and output on the channel dimension, so later
            # channels hold the features of later conv units
            X = torch.cat((X, Y), dim=1)
        return X

def transition_block(input_channels, num_channels):
    """Return a transition layer: BN -> ReLU -> 1x1 conv -> 2x2 average pool.

    The 1x1 conv changes the channel count to ``num_channels`` and the
    stride-2 average pool halves the spatial size. Pooling here is average
    pooling, not max pooling.

    Args:
        input_channels: channel count of the input.
        num_channels: channel count of the output.
    """
    return nn.Sequential(
        nn.BatchNorm2d(input_channels), nn.ReLU(),
        nn.Conv2d(input_channels, num_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2))

# Stem of the network: 7x7 stride-2 conv, batch-norm, ReLU, 3x3 stride-2 max-pool.
b1 = nn.Sequential(
    nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
    nn.BatchNorm2d(64), nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1))
# num_channels tracks the current channel count. Each conv unit adds
# growth_rate (32) channels and each dense block has 4 conv units, so one
# block adds 128 channels.
num_channels, growth_rate = 64, 32
num_convs_in_dense_blocks = [4, 4, 4, 4]
blks = []
for i, num_convs in enumerate(num_convs_in_dense_blocks):
    blks.append(DenseBlock(num_convs, num_channels, growth_rate))
    # channel count leaving the dense block just added
    num_channels += num_convs * growth_rate
    # between dense blocks, a transition layer halves the channel count
    if i != len(num_convs_in_dense_blocks) - 1:
        blks.append(transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2

# The network ends with BN, ReLU, global average pooling and a fully connected
# layer. Average pooling matches the stated design; max pooling was used before.
net = nn.Sequential(
    b1, *blks,
    nn.BatchNorm2d(num_channels), nn.ReLU(),
    nn.AdaptiveAvgPool2d((1, 1)),
    nn.Flatten(),
    nn.Linear(num_channels, 10))

YOLO复现

YOLO是物体检测的高效实现。给定一张图片,我们可以通过YOLO检测出多个目标并给出目标大小。其具体实现脉络如下:
输入图片->图片通过CNN得到一个特征张量->从张量中得到特别多的目标框->去掉其中概率太小的框->选定最大概率的框并去掉与其交并比太大的框->得到结果

寄了,没做出来,本贴终结。