Underwater Object Detection on RUOD with Multiple Instance Learning and YOLOv10

1. Introduction

Researchers at Tsinghua University published "YOLOv10: Real-Time End-to-End Object Detection", which improves detection accuracy while cutting inference latency, largely by removing the NMS post-processing step.

So I took the RUOD underwater object detection dataset, open-sourced on Baidu's PaddlePaddle AI Studio, and combined a multiple instance learning (MIL) attention mechanism with YOLOv10, aiming for faster learning and higher efficiency. Here is the work I did:

2. Setup

(1) Environment

python >= 3.9
Anaconda
NVIDIA GeForce GTX 1080 Ti
Windows

(2) Installing YOLOv10

conda create -n yolo10 python=3.9
conda activate yolo10
git clone https://github.com/THU-MIG/yolov10.git
cd yolov10
pip install -r requirements.txt
pip install -e .

If you hit the error: ERROR: No matching distribution found for xxxx.

Workaround: install offline:

pip install -e .  --no-build-isolation --no-index --find-links=./

Then pip install whichever packages are still reported missing.
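To confirm the install worked, here is a quick sanity check, assuming yolov10n.pt is present in the working directory (it can be downloaded from the YOLOv10 release page):

import torch
from ultralytics import YOLOv10

# Load the nano checkpoint and print a model summary; if this runs,
# the package and its PyTorch dependencies are wired up correctly.
model = YOLOv10('yolov10n.pt')
model.info()
print('CUDA available:', torch.cuda.is_available())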

(3) The RUOD dataset

1) Download: grab the RUOD underwater object detection dataset from the PaddlePaddle AI Studio community. It covers 10 common classes: holothurian, echinus, scallop, starfish, fish, corals, diver, cuttlefish, turtle and jellyfish;
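To double-check the class list and its order (which must match coco.yaml later), you can peek at the COCO annotation file. The path below assumes the layout used by the conversion script in the next step:

import json

# Assumed annotation path; adjust to wherever RUOD_ANN was unpacked
with open('H:/xiangmu/2/RUOD/RUOD_ANN/instances_train.json') as f:
    data = json.load(f)

# COCO stores the class list under 'categories'; print id -> name in order
for c in sorted(data['categories'], key=lambda c: c['id']):
    print(c['id'], c['name'])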

2) Data preparation:

The RUOD annotations ship in COCO JSON format, so we need to convert them to YOLO txt labels. Create a new .py file in the yolov10 directory, paste in the following code, and run it with python:

import json
import glob
import os
import shutil
from pathlib import Path
import numpy as np
from tqdm import tqdm


def make_folders(path='./coco/'):
    # Create folders
    if os.path.exists(path):
        shutil.rmtree(path)  # delete output folder
    os.makedirs(path)  # make new output folder
    os.makedirs(path + os.sep + 'labels')  # make new labels folder
    os.makedirs(path + os.sep + 'images')  # make new images folder
    return path


def convert_coco_json(json_dir='H:/xiangmu/2/RUOD/RUOD_ANN/'):
    jsons = glob.glob(json_dir + '*.json')  # Import json
    for json_file in sorted(jsons):
        fn = 'coco/labels/%s/' % Path(json_file).stem.replace('instances_', '')  # folder name
        fn_images = 'coco/images/%s/' % Path(json_file).stem.replace('instances_', '')  # folder name
        os.makedirs(fn, exist_ok=True)
        os.makedirs(fn_images, exist_ok=True)

        with open(json_file) as f:
            data = json.load(f)

        print(fn)

        # Create image dict
        images = {'%g' % x['id']: x for x in data['images']}

        # Write labels file
        for x in tqdm(data['annotations'], desc='Annotations %s' % json_file):
            if x['iscrowd']:
                continue
            img = images['%g' % x['image_id']]
            h, w, f = img['height'], img['width'], img['file_name']
            file_path = './RUOD/RUOD_pic/' + fn.split('/')[-2] + "/" + f

            # The COCO bounding box format is [top left x, top left y, width, height]
            box = np.array(x['bbox'], dtype=np.float64)
            box[:2] += box[2:] / 2  # xy top-left corner to center
            box[[0, 2]] /= w  # normalize x
            box[[1, 3]] /= h  # normalize y

            if (box[2] > 0.) and (box[3] > 0.):  # if w > 0 and h > 0
                with open(fn + Path(f).stem + '.txt', 'a') as file:
                    file.write('%g %.6f %.6f %.6f %.6f\n' % (x['category_id'] - 1, *box))

                file_path_t = fn_images + f
                print(file_path, file_path_t)
                shutil.copy(file_path, file_path_t)


convert_coco_json()
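The core of the conversion is the box transform: COCO stores [top-left x, top-left y, width, height] in pixels, while YOLO wants [center x, center y, width, height] normalized by the image size. A tiny worked example with made-up numbers:

import numpy as np

w, h = 640, 480                          # image size in pixels
box = np.array([100., 120., 50., 40.])   # COCO: [x_tl, y_tl, w, h]

box[:2] += box[2:] / 2   # top-left corner -> center: [125, 140, 50, 40]
box[[0, 2]] /= w         # normalize x and width by image width
box[[1, 3]] /= h         # normalize y and height by image height

print(box)  # [0.195312 0.291667 0.078125 0.083333] -> YOLO normalized xywh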

After it finishes, create a file named coco.yaml in the same directory with the following content:

train: ./coco/images/train # train images
val: ./coco/images/test # val images
names: ['holothurian', 'echinus', 'scallop', 'starfish', 'fish', 'corals', 'diver', 'cuttlefish', 'turtle', 'jellyfish']
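Before training, it is worth verifying that every copied image got a matching label file. A quick check, assuming the ./coco output layout produced by the script above:

import glob
from pathlib import Path

for split in ('train', 'test'):
    imgs = {Path(p).stem for p in glob.glob(f'./coco/images/{split}/*')}
    lbls = {Path(p).stem for p in glob.glob(f'./coco/labels/{split}/*.txt')}
    print(split, len(imgs), 'images,', len(lbls), 'labels,',
          len(imgs - lbls), 'images without labels')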

The project structure looks like this:

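(A sketch of the expected layout, assuming the default paths used above:)

yolov10/
├── coco/
│   ├── images/
│   │   ├── train/
│   │   └── test/
│   └── labels/
│       ├── train/
│       └── test/
├── RUOD/
│   ├── RUOD_ANN/
│   └── RUOD_pic/
├── coco.yaml
├── main.py
└── yolov10n.pt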

3) Now we can start training. Create main.py in the same directory and paste in:

import torch
import torch.nn as nn
import torch.nn.functional as F
from ultralytics import YOLOv10


class Attention(nn.Module):
    def __init__(self):
        super(Attention, self).__init__()
        self.M = 500
        self.L = 128
        self.ATTENTION_BRANCHES = 1

        self.feature_extractor_part1 = nn.Sequential(
            nn.Conv2d(1, 20, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(20, 50, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2)
        )

        self.feature_extractor_part2 = nn.Sequential(
            nn.Linear(50 * 4 * 4, self.M),
            nn.ReLU(),
        )

        self.attention = nn.Sequential(
            nn.Linear(self.M, self.L),  # matrix V
            nn.Tanh(),
            nn.Linear(self.L, self.ATTENTION_BRANCHES)  # matrix w (or vector w if self.ATTENTION_BRANCHES==1)
        )

        self.classifier = nn.Sequential(
            nn.Linear(self.M * self.ATTENTION_BRANCHES, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = x.squeeze(0)

        H = self.feature_extractor_part1(x)
        H = H.view(-1, 50 * 4 * 4)
        H = self.feature_extractor_part2(H)  # KxM

        A = self.attention(H)  # KxATTENTION_BRANCHES
        A = torch.transpose(A, 1, 0)  # ATTENTION_BRANCHESxK
        A = F.softmax(A, dim=1)  # softmax over K

        Z = torch.mm(A, H)  # ATTENTION_BRANCHESxM

        Y_prob = self.classifier(Z)
        Y_hat = torch.ge(Y_prob, 0.5).float()

        return Y_prob, Y_hat, A

    # AUXILIARY METHODS
    '''def calculate_classification_error(self, X, Y):
        Y = Y.float()
        _, Y_hat, _ = self.forward(X)
        error = 1. - Y_hat.eq(Y).cpu().float().mean().data.item()

        return error, Y_hat

    def calculate_objective(self, X, Y):
        Y = Y.float()
        Y_prob, _, A = self.forward(X)
        Y_prob = torch.clamp(Y_prob, min=1e-5, max=1. - 1e-5)
        neg_log_likelihood = -1. * (Y * torch.log(Y_prob) + (1. - Y) * torch.log(1. - Y_prob))  # negative log bernoulli

        return neg_log_likelihood, A'''


class GatedAttention(nn.Module):
    def __init__(self):
        super(GatedAttention, self).__init__()
        self.M = 500
        self.L = 128
        self.ATTENTION_BRANCHES = 1

        self.feature_extractor_part1 = nn.Sequential(
            nn.Conv2d(1, 20, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2),
            nn.Conv2d(20, 50, kernel_size=5),
            nn.ReLU(),
            nn.MaxPool2d(2, stride=2)
        )

        self.feature_extractor_part2 = nn.Sequential(
            nn.Linear(50 * 4 * 4, self.M),
            nn.ReLU(),
        )

        self.attention_V = nn.Sequential(
            nn.Linear(self.M, self.L),  # matrix V
            nn.Tanh()
        )

        self.attention_U = nn.Sequential(
            nn.Linear(self.M, self.L),  # matrix U
            nn.Sigmoid()
        )

        self.attention_w = nn.Linear(self.L, self.ATTENTION_BRANCHES)  # matrix w (or vector w if self.ATTENTION_BRANCHES==1)

        self.classifier = nn.Sequential(
            nn.Linear(self.M * self.ATTENTION_BRANCHES, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x = x.squeeze(0)

        H = self.feature_extractor_part1(x)
        H = H.view(-1, 50 * 4 * 4)
        H = self.feature_extractor_part2(H)  # KxM

        A_V = self.attention_V(H)  # KxL
        A_U = self.attention_U(H)  # KxL
        A = self.attention_w(A_V * A_U)  # element wise multiplication # KxATTENTION_BRANCHES
        A = torch.transpose(A, 1, 0)  # ATTENTION_BRANCHESxK
        A = F.softmax(A, dim=1)  # softmax over K

        Z = torch.mm(A, H)  # ATTENTION_BRANCHESxM

        Y_prob = self.classifier(Z)
        Y_hat = torch.ge(Y_prob, 0.5).float()

        return Y_prob, Y_hat, A

    # AUXILIARY METHODS
    '''def calculate_classification_error(self, X, Y):
        Y = Y.float()
        _, Y_hat, _ = self.forward(X)
        error = 1. - Y_hat.eq(Y).cpu().float().mean().item()

        return error, Y_hat

    def calculate_objective(self, X, Y):
        Y = Y.float()
        Y_prob, _, A = self.forward(X)
        Y_prob = torch.clamp(Y_prob, min=1e-5, max=1. - 1e-5)
        neg_log_likelihood = -1. * (Y * torch.log(Y_prob) + (1. - Y) * torch.log(1. - Y_prob))  # negative log bernoulli

        return neg_log_likelihood, A'''


class MIL_YOLOv10(YOLOv10):
    def __init__(self, model_cfg, attention_type='attention'):
        super(MIL_YOLOv10, self).__init__(model_cfg)

        # Load pretrained weights
        self.load('yolov10n.pt')

        # Choose the attention variant
        if attention_type == 'attention':
            self.attention_module = Attention()
        elif attention_type == 'gated_attention':
            self.attention_module = GatedAttention()
        else:
            raise ValueError("Unknown attention type")

    def forward(self, x):
        # Get YOLOv10's feature maps
        features = super().forward(x)  # use YOLOv10's feature extraction

        # Flatten the feature maps and pass them through the MIL module
        features = features.view(features.size(0), -1)

        # Weight the features with the attention mechanism
        attention_weights, _, _ = self.attention_module(features)

        return attention_weights


if __name__ == '__main__':
    # Build the MIL-YOLOv10 model
    model_cfg = "H:/xiangmu/2/yolov10/ultralytics/cfg/models/v10/yolov10n.yaml"
    model = MIL_YOLOv10(model_cfg, attention_type='attention')  # choose 'attention' or 'gated_attention'

    # Train the model
    # YOLOv10 provides a high-level training interface; see the YOLOv10 docs for
    # the full list of options. Here we assume its `train` method can be called directly.
    results = model.train(
        data="coco.yaml",
        patience=0,
        epochs=50,
        device='0',
        batch=16,
        seed=42
    )
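For intuition about what the attention module computes: given a bag of K instances, it produces weights A (softmax-normalized over the bag, so they sum to 1) and a pooled embedding Z = A·H that the bag-level classifier consumes. A minimal sketch on dummy data, assuming the Attention class above is importable from main.py:

import torch
from main import Attention  # the Attention class defined above

att = Attention()
bag = torch.randn(1, 7, 1, 28, 28)  # one bag of K=7 single-channel 28x28 instances

Y_prob, Y_hat, A = att(bag)
print(Y_prob.shape, A.shape)  # torch.Size([1, 1]) torch.Size([1, 7])
print(A.sum().item())         # ~1.0: the weights form a distribution over the bag

Note that the convolutional front end mirrors the original attention-based deep MIL setup, which expects small single-channel patches; the 50 * 4 * 4 flatten only works out for 28x28 inputs.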

4) Validation:

from ultralytics import YOLOv10

if __name__ == '__main__':
    # Load a custom model
    model = YOLOv10('runs/detect/train/weights/best.pt')

    # Validate the model; to evaluate on the test split, pass split='test'
    metrics = model.val(split='val', save_json=True)
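The returned metrics object exposes the usual COCO-style numbers; for example (attribute names per the ultralytics metrics API):

# Overall and per-class mAP from the validation run
print('mAP50-95:', metrics.box.map)
print('mAP50:   ', metrics.box.map50)
print('per-class mAP50-95:', metrics.box.maps)  # one value per class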

5) Testing (running the trained .pt on other data):

from ultralytics import YOLOv10

if __name__ == '__main__':
    # Load a custom model
    model = YOLOv10('runs/detect/train/weights/best.pt')

    # Predict on an image
    results = model.predict(source="ultralytics/assets", device='0', visualize=True, save=True)

    # Print results
    print(results)
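Rather than printing the raw results list, you can pull out the boxes, confidences and class names per image (standard fields on the ultralytics Results object):

for r in results:
    for box in r.boxes:
        cls_id = int(box.cls)                   # class index
        conf = float(box.conf)                  # confidence score
        x1, y1, x2, y2 = box.xyxy[0].tolist()   # pixel coordinates
        print(r.path, model.names[cls_id], f'{conf:.2f}', (x1, y1, x2, y2))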

6) Reading the saved features:

# Import the required packages
import numpy as np

# Path to the .npy feature file saved during prediction
test = np.load('runs/detect/predict/zidane/stage2_C2f_features.npy')
print(test.shape[0])
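To get a feel for what those features look like, a few channels can be rendered as images. This is a sketch; it assumes matplotlib is installed and that the array is laid out as (channels, height, width):

import numpy as np
import matplotlib.pyplot as plt

feats = np.load('runs/detect/predict/zidane/stage2_C2f_features.npy')
print('feature array shape:', feats.shape)

# Plot the first 8 channels side by side
fig, axes = plt.subplots(1, 8, figsize=(16, 2))
for i, ax in enumerate(axes):
    ax.imshow(feats[i], cmap='viridis')
    ax.set_title(f'ch {i}')
    ax.axis('off')
plt.savefig('stage2_features.png')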

3. Code

https://github.com/Lucki-ly/yolov10

4. Results

(1) Training results:

(figure: training run screenshot)

(2) Validation results:

(figure: validation metrics screenshot)

(3) Test results:

(figures: detection examples on test images)
