Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
- .gitattributes +12 -0
- assets/1.jpg +3 -0
- assets/2.jpg +3 -0
- assets/3.jpg +3 -0
- assets/4.jpg +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738985521.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995505.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995594.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995658.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742698.autodl-container-10a44fbcf4-a7468946 +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742996.autodl-container-10a44fbcf4-a7468946 +3 -0
- checkpoints/imagenet/hole_benchmark/gen_00430000.pt +3 -0
- checkpoints/ostracoda_cyclegan/latest_net_D_A.pth +3 -0
- checkpoints/ostracoda_cyclegan/latest_net_D_B.pth +3 -0
- data/style/11.png +3 -0
- data/style/32.jpg +3 -0
- data/style/6.jpg +3 -0
- data/style/7.jpg +3 -0
- data/texture/16.jpg +3 -0
- data/texture/17.jpg +3 -0
- data/texture/4.jpg +3 -0
- data/texture/8.jpg +3 -0
- model/tokenizer/tokenizer_config.json +34 -0
- model/tokenizer/vocab.json +0 -0
- model/unet/config.json +36 -0
- model/vae/config.json +29 -0
- sddfrcnn_model/__pycache__/draw_box_utils.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__init__.py +3 -0
- sddfrcnn_model/backbone/__pycache__/__init__.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/feature_pyramid_network.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/res50_backbone.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/resnet50_fpn_model.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/ssd_model.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/utils.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/feature_pyramid_network.py +283 -0
- sddfrcnn_model/backbone/res50_backbone.py +106 -0
- sddfrcnn_model/backbone/resnet50_fpn_model.py +199 -0
- sddfrcnn_model/backbone/ssd_model.py +225 -0
- sddfrcnn_model/backbone/utils.py +628 -0
- sddfrcnn_model/draw_box_utils.py +197 -0
- sddfrcnn_model/network_files/__init__.py +1 -0
- sddfrcnn_model/network_files/__pycache__/__init__.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/anchor_utils.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/boxes.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/det_utils.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/image_list.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/losses.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/retinanet.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/transform.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/anchor_utils.py +192 -0
.gitattributes CHANGED
@@ -65,3 +65,15 @@ data/style/5.jpg filter=lfs diff=lfs merge=lfs -text
 data/style/59.png filter=lfs diff=lfs merge=lfs -text
 data/texture/14.jpg filter=lfs diff=lfs merge=lfs -text
 data/texture/15.jpg filter=lfs diff=lfs merge=lfs -text
+data/texture/16.jpg filter=lfs diff=lfs merge=lfs -text
+data/texture/4.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/7.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/11.png filter=lfs diff=lfs merge=lfs -text
+data/texture/8.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/6.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/32.jpg filter=lfs diff=lfs merge=lfs -text
+data/texture/17.jpg filter=lfs diff=lfs merge=lfs -text
+assets/2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/3.jpg filter=lfs diff=lfs merge=lfs -text
+assets/4.jpg filter=lfs diff=lfs merge=lfs -text
+assets/1.jpg filter=lfs diff=lfs merge=lfs -text
assets/1.jpg ADDED (binary image, stored via Git LFS)
assets/2.jpg ADDED (binary image, stored via Git LFS)
assets/3.jpg ADDED (binary image, stored via Git LFS)
assets/4.jpg ADDED (binary image, stored via Git LFS)
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738985521.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:970514f929b756e7026179fd443b21eff57e61901c16d3bbd3af81afe0de53dd
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995505.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:db215c86a3dfe491ef28a21766d485c84edf70099f98464be9fa1cddd3f4e633
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995594.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bde18faa073f516a72bbdd40bb5e08e5fb8da56914455612b49d9c8d85b3cca8
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995658.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7127eee0799d2360a417549a82c10cc3b12ec09f9015495257ec92e55383894
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742698.autodl-container-10a44fbcf4-a7468946 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f75095a66582caded1c54a9fa7cc3e7edc13d3e9e17a559929cace1f64f6f7e2
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742996.autodl-container-10a44fbcf4-a7468946 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79c1f85ed120d2ac696dfa584e7368b9a5eae288f5e6d938ff68788146279d5c
size 152845

checkpoints/imagenet/hole_benchmark/gen_00430000.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee688f6cf0649a0eeea9c4623719eeab52bf39f2a5f2dabf80cbcf1995f289b3
size 14443538

checkpoints/ostracoda_cyclegan/latest_net_D_A.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0a52376b4c7fdb72089e48a3aa1e9c6f3f26576dba68c01f058643baf4506944
size 11063002

checkpoints/ostracoda_cyclegan/latest_net_D_B.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7894e2f5f98edad44b0eb0202ce5b6f64669a0436099caaf36719ea2a8e963eb
size 11063002
data/style/11.png ADDED (binary image, stored via Git LFS)
data/style/32.jpg ADDED (binary image, stored via Git LFS)
data/style/6.jpg ADDED (binary image, stored via Git LFS)
data/style/7.jpg ADDED (binary image, stored via Git LFS)
data/texture/16.jpg ADDED (binary image, stored via Git LFS)
data/texture/17.jpg ADDED (binary image, stored via Git LFS)
data/texture/4.jpg ADDED (binary image, stored via Git LFS)
data/texture/8.jpg ADDED (binary image, stored via Git LFS)
model/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
{
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "do_lower_case": true,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "model_max_length": 77,
  "name_or_path": "openai/clip-vit-large-patch14",
  "pad_token": "<|endoftext|>",
  "special_tokens_map_file": "./special_tokens_map.json",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
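This is the stock CLIP text tokenizer configuration used by Stable Diffusion v1.x checkpoints. A minimal loading sketch (assumptions: transformers is installed, the repo is cloned locally, and the sibling tokenizer files this truncated 50-file view does not list, such as merges.txt and special_tokens_map.json, are also present in model/tokenizer):

    from transformers import CLIPTokenizer

    # Load the tokenizer added in this commit from its local folder.
    tokenizer = CLIPTokenizer.from_pretrained("model/tokenizer")

    # "a photo of an ostracod" is an arbitrary example prompt; padding to
    # model_max_length (77, per the config above) mirrors how diffusion
    # pipelines feed the text encoder.
    ids = tokenizer("a photo of an ostracod", padding="max_length",
                    max_length=tokenizer.model_max_length).input_ids
    print(len(ids))  # 77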
model/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff.
model/unet/config.json ADDED
@@ -0,0 +1,36 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.6.0",
  "act_fn": "silu",
  "attention_head_dim": 8,
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "center_input_sample": false,
  "cross_attention_dim": 768,
  "down_block_types": [
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "DownBlock2D"
  ],
  "downsample_padding": 1,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": 2,
  "mid_block_scale_factor": 1,
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "out_channels": 4,
  "sample_size": 64,
  "up_block_types": [
    "UpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D"
  ]
}
model/vae/config.json ADDED
@@ -0,0 +1,29 @@
{
  "_class_name": "AutoencoderKL",
  "_diffusers_version": "0.6.0",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "in_channels": 3,
  "latent_channels": 4,
  "layers_per_block": 2,
  "norm_num_groups": 32,
  "out_channels": 3,
  "sample_size": 512,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ]
}
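Both of these configs match the standard Stable Diffusion v1 UNet and VAE. A minimal sketch of instantiating the two architectures from the configs alone (assumption: diffusers is installed; from_config builds randomly initialized weights, so if the repo also ships weight files, which this truncated view does not show, prefer from_pretrained with the matching subfolder):

    import json
    from diffusers import UNet2DConditionModel, AutoencoderKL

    # Build the architectures from the shipped configs (random weights).
    with open("model/unet/config.json") as f:
        unet = UNet2DConditionModel.from_config(json.load(f))
    with open("model/vae/config.json") as f:
        vae = AutoencoderKL.from_config(json.load(f))

    print(unet.config.cross_attention_dim)  # 768
    print(vae.config.latent_channels)       # 4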
sddfrcnn_model/__pycache__/draw_box_utils.cpython-310.pyc ADDED
Binary file (5.2 kB)
sddfrcnn_model/backbone/__init__.py ADDED
@@ -0,0 +1,3 @@
from .feature_pyramid_network import FeaturePyramidNetwork, LastLevelP6P7, LastLevelMaxPool
from .resnet50_fpn_model import resnet50_fpn_backbone
from .ssd_model import SSD300, Backbone
sddfrcnn_model/backbone/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (407 Bytes)

sddfrcnn_model/backbone/__pycache__/feature_pyramid_network.cpython-310.pyc ADDED
Binary file (10.1 kB)

sddfrcnn_model/backbone/__pycache__/res50_backbone.cpython-310.pyc ADDED
Binary file (3.27 kB)

sddfrcnn_model/backbone/__pycache__/resnet50_fpn_model.cpython-310.pyc ADDED
Binary file (6.26 kB)

sddfrcnn_model/backbone/__pycache__/ssd_model.cpython-310.pyc ADDED
Binary file (6.61 kB)

sddfrcnn_model/backbone/__pycache__/utils.cpython-310.pyc ADDED
Binary file (14.9 kB)
sddfrcnn_model/backbone/feature_pyramid_network.py ADDED
@@ -0,0 +1,283 @@
from collections import OrderedDict

import torch.nn as nn
import torch
from torch import Tensor
import torch.nn.functional as F

from torch.jit.annotations import Tuple, List, Dict


class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model.
    It has a strong assumption that the modules have been registered
    into the model in the same order as they are used.
    This means that one should **not** reuse the same nn.Module
    twice in the forward if you want this to work.
    Additionally, it is only able to query submodules that are directly
    assigned to the model. So if `model` is passed, `model.feature1` can
    be returned, but not `model.feature1.layer2`.
    Arguments:
        model (nn.Module): model on which we will extract the features
        return_layers (Dict[name, new_name]): a dict containing the names
            of the modules for which the activations will be returned as
            the key of the dict, and the value of the dict is the name
            of the returned activation (which the user can specify).
    """
    __annotations__ = {
        "return_layers": Dict[str, str],
    }

    def __init__(self, model, return_layers):
        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
            raise ValueError("return_layers are not present in model")

        orig_return_layers = return_layers
        return_layers = {str(k): str(v) for k, v in return_layers.items()}
        layers = OrderedDict()

        # walk the model's child modules in order, storing them in an OrderedDict;
        # keep only layer4 and everything before it, dropping the unused tail
        for name, module in model.named_children():
            layers[name] = module
            if name in return_layers:
                del return_layers[name]
            if not return_layers:
                break

        super().__init__(layers)
        self.return_layers = orig_return_layers

    def forward(self, x):
        out = OrderedDict()
        # run the input through every child module in order and
        # collect the outputs of layer1, layer2, layer3, layer4
        for name, module in self.items():
            x = module(x)
            if name in self.return_layers:
                out_name = self.return_layers[name]
                out[out_name] = x
        return out


class BackboneWithFPN(nn.Module):
    """
    Adds an FPN on top of a model.
    Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
    extract a submodel that returns the feature maps specified in return_layers.
    The same limitations of IntermediateLayerGetter apply here.
    Arguments:
        backbone (nn.Module)
        return_layers (Dict[name, new_name]): a dict containing the names
            of the modules for which the activations will be returned as
            the key of the dict, and the value of the dict is the name
            of the returned activation (which the user can specify).
        in_channels_list (List[int]): number of channels for each feature map
            that is returned, in the order they are present in the OrderedDict
        out_channels (int): number of channels in the FPN.
        extra_blocks: ExtraFPNBlock
    Attributes:
        out_channels (int): the number of channels in the FPN
    """

    def __init__(self,
                 backbone: nn.Module,
                 return_layers=None,
                 in_channels_list=None,
                 out_channels=256,
                 extra_blocks=None,
                 re_getter=True):
        super().__init__()

        if extra_blocks is None:
            extra_blocks = LastLevelMaxPool()

        if re_getter:
            assert return_layers is not None
            self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        else:
            self.body = backbone

        self.fpn = FeaturePyramidNetwork(
            in_channels_list=in_channels_list,
            out_channels=out_channels,
            extra_blocks=extra_blocks,
        )

        self.out_channels = out_channels

    def forward(self, x):
        x = self.body(x)
        x = self.fpn(x)
        return x


class ExtraFPNBlock(nn.Module):
    """
    Base class for the extra block in the FPN.

    Args:
        results (List[Tensor]): the result of the FPN
        x (List[Tensor]): the original feature maps
        names (List[str]): the names for each one of the
            original feature maps

    Returns:
        results (List[Tensor]): the extended set of results
            of the FPN
        names (List[str]): the extended set of names for the results
    """
    def forward(self,
                results: List[Tensor],
                x: List[Tensor],
                names: List[str]) -> Tuple[List[Tensor], List[str]]:
        pass


class LastLevelMaxPool(torch.nn.Module):
    """
    Applies a max_pool2d on top of the last feature map
    """

    def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
        names.append("pool")
        x.append(F.max_pool2d(x[-1], 1, 2, 0))
        return x, names


class LastLevelP6P7(ExtraFPNBlock):
    """
    This module is used in RetinaNet to generate extra layers, P6 and P7.
    """
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
        self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
        for module in [self.p6, self.p7]:
            nn.init.kaiming_uniform_(module.weight, a=1)
            nn.init.constant_(module.bias, 0)
        self.use_P5 = in_channels == out_channels

    def forward(self,
                p: List[Tensor],
                c: List[Tensor],
                names: List[str]) -> Tuple[List[Tensor], List[str]]:
        p5, c5 = p[-1], c[-1]
        x = p5 if self.use_P5 else c5
        p6 = self.p6(x)
        p7 = self.p7(F.relu(p6))
        p.extend([p6, p7])
        names.extend(["p6", "p7"])
        return p, names


class FeaturePyramidNetwork(nn.Module):
    """
    Module that adds an FPN on top of a set of feature maps. This is based on
    `"Feature Pyramid Network for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
    The feature maps are currently supposed to be in increasing depth
    order.
    The input to the model is expected to be an OrderedDict[Tensor], containing
    the feature maps on top of which the FPN will be added.
    Arguments:
        in_channels_list (list[int]): number of channels for each feature map that
            is passed to the module
        out_channels (int): number of channels of the FPN representation
        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
            be performed. It is expected to take the fpn features, the original
            features and the names of the original features as input, and returns
            a new list of feature maps and their corresponding names
    """

    def __init__(self, in_channels_list, out_channels, extra_blocks=None):
        super().__init__()
        # 1x1 convs that project the resnet feature maps (layer1-4) to a common channel count
        self.inner_blocks = nn.ModuleList()
        # 3x3 convs applied to the projected maps to produce the prediction feature maps
        self.layer_blocks = nn.ModuleList()
        for in_channels in in_channels_list:
            if in_channels == 0:
                continue
            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
            self.inner_blocks.append(inner_block_module)
            self.layer_blocks.append(layer_block_module)

        # initialize parameters now to avoid modifying the initialization of top_blocks
        for m in self.children():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

        self.extra_blocks = extra_blocks

    def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.inner_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.inner_blocks)
        if idx < 0:
            idx += num_blocks
        i = 0
        out = x
        for module in self.inner_blocks:
            if i == idx:
                out = module(x)
            i += 1
        return out

    def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.layer_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.layer_blocks)
        if idx < 0:
            idx += num_blocks
        i = 0
        out = x
        for module in self.layer_blocks:
            if i == idx:
                out = module(x)
            i += 1
        return out

    def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Computes the FPN for a set of feature maps.
        Arguments:
            x (OrderedDict[Tensor]): feature maps for each feature level.
        Returns:
            results (OrderedDict[Tensor]): feature maps after FPN layers.
                They are ordered from highest resolution first.
        """
        # unpack OrderedDict into two lists for easier handling
        names = list(x.keys())
        x = list(x.values())

        # project resnet layer4's channels down to the requested out_channels
        # last_inner = self.inner_blocks[-1](x[-1])
        last_inner = self.get_result_from_inner_blocks(x[-1], -1)
        # results holds every prediction feature map
        results = []
        # pass the projected layer4 map through a 3x3 conv to get its prediction feature map
        # results.append(self.layer_blocks[-1](last_inner))
        results.append(self.get_result_from_layer_blocks(last_inner, -1))

        for idx in range(len(x) - 2, -1, -1):
            inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
            feat_shape = inner_lateral.shape[-2:]
            inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
            last_inner = inner_lateral + inner_top_down
            results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))

        # generate a fifth prediction feature map on top of the layer4 one
        if self.extra_blocks is not None:
            results, names = self.extra_blocks(results, x, names)

        # make it back an OrderedDict
        out = OrderedDict([(k, v) for k, v in zip(names, results)])

        return out
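BackboneWithFPN above is self-contained, so it can wrap any backbone whose children expose the requested layers. A minimal smoke test (assumptions: run from the repo root with torch and torchvision installed; the stock torchvision ResNet-50 stands in for the project's own backbone):

    import torch
    import torchvision
    from sddfrcnn_model.backbone.feature_pyramid_network import BackboneWithFPN

    # layer1..layer4 of ResNet-50 output 256/512/1024/2048 channels.
    backbone = BackboneWithFPN(
        torchvision.models.resnet50(),
        return_layers={"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"},
        in_channels_list=[256, 512, 1024, 2048],
        out_channels=256)
    feats = backbone(torch.randn(1, 3, 224, 224))
    # keys '0'..'3' plus 'pool' (added by LastLevelMaxPool), all with 256 channels
    print({k: tuple(v.shape) for k, v in feats.items()})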
sddfrcnn_model/backbone/res50_backbone.py ADDED
@@ -0,0 +1,106 @@
import torch.nn as nn
import torch


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channel, out_channel, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
                               kernel_size=1, stride=1, bias=False)  # squeeze channels
        self.bn1 = nn.BatchNorm2d(out_channel)
        # -----------------------------------------
        self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
                               kernel_size=3, stride=stride, bias=False, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channel)
        # -----------------------------------------
        self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel*self.expansion,
                               kernel_size=1, stride=1, bias=False)  # unsqueeze channels
        self.bn3 = nn.BatchNorm2d(out_channel*self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, blocks_num, num_classes=1000, include_top=True):
        super(ResNet, self).__init__()
        self.include_top = include_top
        self.in_channel = 64

        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channel)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, blocks_num[0])
        self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
        self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
        self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))  # output size = (1, 1)
            self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        downsample = None
        if stride != 1 or self.in_channel != channel * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(channel * block.expansion))

        layers = []
        layers.append(block(self.in_channel, channel, downsample=downsample, stride=stride))
        self.in_channel = channel * block.expansion

        for _ in range(1, block_num):
            layers.append(block(self.in_channel, channel))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        if self.include_top:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.fc(x)

        return x


def resnet50(num_classes=1000, include_top=True):
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)
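A quick sanity check of this plain classifier variant (assumption: run from the repo root); the include_top=False configuration is what the SSD backbone further below relies on:

    import torch
    from sddfrcnn_model.backbone.res50_backbone import resnet50

    net = resnet50(num_classes=1000)
    logits = net(torch.randn(2, 3, 224, 224))
    print(logits.shape)  # torch.Size([2, 1000])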
sddfrcnn_model/backbone/resnet50_fpn_model.py ADDED
@@ -0,0 +1,199 @@
import os

import torch.nn as nn
import torch
from torchvision.ops.misc import FrozenBatchNorm2d

from .feature_pyramid_network import LastLevelMaxPool, BackboneWithFPN


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
                               kernel_size=1, stride=1, bias=False)  # squeeze channels
        self.bn1 = norm_layer(out_channel)
        # -----------------------------------------
        self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
                               kernel_size=3, stride=stride, bias=False, padding=1)
        self.bn2 = norm_layer(out_channel)
        # -----------------------------------------
        self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion,
                               kernel_size=1, stride=1, bias=False)  # unsqueeze channels
        self.bn3 = norm_layer(out_channel * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.include_top = include_top
        self.in_channel = 64

        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1 = norm_layer(self.in_channel)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, blocks_num[0])
        self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
        self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
        self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))  # output size = (1, 1)
            self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        norm_layer = self._norm_layer
        downsample = None
        if stride != 1 or self.in_channel != channel * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
                norm_layer(channel * block.expansion))

        layers = []
        layers.append(block(self.in_channel, channel, downsample=downsample,
                            stride=stride, norm_layer=norm_layer))
        self.in_channel = channel * block.expansion

        for _ in range(1, block_num):
            layers.append(block(self.in_channel, channel, norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        if self.include_top:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.fc(x)

        return x


def overwrite_eps(model, eps):
    """
    This method overwrites the default eps values of all the
    FrozenBatchNorm2d layers of the model with the provided value.
    This is necessary to address the BC-breaking change introduced
    by the bug-fix at pytorch/vision#2933. The overwrite is applied
    only when the pretrained weights are loaded to maintain compatibility
    with previous versions.

    Args:
        model (nn.Module): The model on which we perform the overwrite.
        eps (float): The new value of eps.
    """
    for module in model.modules():
        if isinstance(module, FrozenBatchNorm2d):
            module.eps = eps


def resnet50_fpn_backbone(pretrain_path="",
                          norm_layer=FrozenBatchNorm2d,  # FrozenBatchNorm2d behaves like BatchNorm2d, but its parameters are never updated
                          trainable_layers=3,
                          returned_layers=None,
                          extra_blocks=None):
    """
    Build a ResNet-50 + FPN backbone.
    Args:
        pretrain_path: path to pretrained resnet50 weights; defaults to empty if unused
        norm_layer: the official default is FrozenBatchNorm2d, i.e. BN layers whose
            parameters are never updated (with very small batch sizes, trainable BN
            hurts more than it helps). If your GPU memory allows a large batch_size,
            you can pass a regular BatchNorm2d instead
            (https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
        trainable_layers: which backbone stages to leave trainable
        returned_layers: which stages' outputs to return
        extra_blocks: extra blocks added on top of the returned feature maps

    Returns:

    """
    resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3],
                             include_top=False,
                             norm_layer=norm_layer)

    if isinstance(norm_layer, FrozenBatchNorm2d):
        overwrite_eps(resnet_backbone, 0.0)

    if pretrain_path != "":
        assert os.path.exists(pretrain_path), "{} does not exist.".format(pretrain_path)
        # load pretrained weights
        print(resnet_backbone.load_state_dict(torch.load(pretrain_path), strict=False))

    # select layers that won't be frozen
    assert 0 <= trainable_layers <= 5
    layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers]

    # when training all layers, don't forget that there is a bn1 right after conv1
    if trainable_layers == 5:
        layers_to_train.append("bn1")

    # freeze layers
    for name, parameter in resnet_backbone.named_parameters():
        # freeze every parameter that does not belong to a layer in layers_to_train
        if all([not name.startswith(layer) for layer in layers_to_train]):
            parameter.requires_grad_(False)

    if extra_blocks is None:
        extra_blocks = LastLevelMaxPool()

    if returned_layers is None:
        returned_layers = [1, 2, 3, 4]
    # the number of returned feature maps must be greater than 0 and less than 5
    assert min(returned_layers) > 0 and max(returned_layers) < 5

    # return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
    return_layers = {f'layer{k}': str(v) for v, k in enumerate(returned_layers)}

    # in_channel is the channel count of layer4's output feature map = 2048
    in_channels_stage2 = resnet_backbone.in_channel // 8  # 256
    # channel counts of the resnet50 feature maps fed to the FPN
    in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
    # channel count of every feature map produced by the FPN
    out_channels = 256
    return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
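A sketch of calling this entry point as shipped (assumptions: run from the repo root; pretrain_path defaults to empty, so no weights are loaded; BN layers are frozen and only stages 2-4 are trainable by default):

    import torch
    from sddfrcnn_model.backbone import resnet50_fpn_backbone

    backbone = resnet50_fpn_backbone(trainable_layers=3)
    backbone.eval()
    feats = backbone(torch.randn(1, 3, 800, 800))
    for name, f in feats.items():
        print(name, tuple(f.shape))  # '0'..'3' plus 'pool', each with 256 channels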
sddfrcnn_model/backbone/ssd_model.py ADDED
@@ -0,0 +1,225 @@
import torch
from torch import nn, Tensor
from torch.jit.annotations import List

from .res50_backbone import resnet50
from .utils import dboxes300_coco, Encoder, PostProcess


class Backbone(nn.Module):
    def __init__(self, pretrain_path=None):
        super(Backbone, self).__init__()
        net = resnet50()
        self.out_channels = [1024, 512, 512, 256, 256, 256]

        if pretrain_path is not None:
            net.load_state_dict(torch.load(pretrain_path))

        self.feature_extractor = nn.Sequential(*list(net.children())[:7])

        conv4_block1 = self.feature_extractor[-1][0]

        # change conv4_block1's stride from 2 to 1
        conv4_block1.conv1.stride = (1, 1)
        conv4_block1.conv2.stride = (1, 1)
        conv4_block1.downsample[0].stride = (1, 1)

    def forward(self, x):
        x = self.feature_extractor(x)
        return x


class SSD300(nn.Module):
    def __init__(self, backbone=None, num_classes=21):
        super(SSD300, self).__init__()
        if backbone is None:
            raise Exception("backbone is None")
        if not hasattr(backbone, "out_channels"):
            raise Exception("the backbone does not have attribute: out_channels")
        self.feature_extractor = backbone

        self.num_classes = num_classes
        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        self._build_additional_features(self.feature_extractor.out_channels)
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        location_extractors = []
        confidence_extractors = []

        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
            # nd is number_default_boxes, oc is output_channel
            location_extractors.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
            confidence_extractors.append(nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))

        self.loc = nn.ModuleList(location_extractors)
        self.conf = nn.ModuleList(confidence_extractors)
        self._init_weights()

        default_box = dboxes300_coco()
        self.compute_loss = Loss(default_box)
        self.encoder = Encoder(default_box)
        self.postprocess = PostProcess(default_box)

    def _build_additional_features(self, input_size):
        """
        Append a series of extra convolutional layers to the backbone (resnet50)
        to obtain the corresponding extra feature extractors.
        :param input_size:
        :return:
        """
        additional_blocks = []
        # input_size = [1024, 512, 512, 256, 256, 256] for resnet50
        middle_channels = [256, 256, 128, 128, 128]
        for i, (input_ch, output_ch, middle_ch) in enumerate(zip(input_size[:-1], input_size[1:], middle_channels)):
            padding, stride = (1, 2) if i < 3 else (0, 1)
            layer = nn.Sequential(
                nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False),
                nn.BatchNorm2d(middle_ch),
                nn.ReLU(inplace=True),
                nn.Conv2d(middle_ch, output_ch, kernel_size=3, padding=padding, stride=stride, bias=False),
                nn.BatchNorm2d(output_ch),
                nn.ReLU(inplace=True),
            )
            additional_blocks.append(layer)
        self.additional_blocks = nn.ModuleList(additional_blocks)

    def _init_weights(self):
        layers = [*self.additional_blocks, *self.loc, *self.conf]
        for layer in layers:
            for param in layer.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)

    # Shape the classifier to the view of bboxes
    def bbox_view(self, features, loc_extractor, conf_extractor):
        locs = []
        confs = []
        for f, l, c in zip(features, loc_extractor, conf_extractor):
            # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1]
            locs.append(l(f).view(f.size(0), 4, -1))
            # [batch, n*classes, feat_size, feat_size] -> [batch, classes, -1]
            confs.append(c(f).view(f.size(0), self.num_classes, -1))

        locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
        return locs, confs

    def forward(self, image, targets=None):
        x = self.feature_extractor(image)

        # Feature Map 38x38x1024, 19x19x512, 10x10x512, 5x5x256, 3x3x256, 1x1x256
        detection_features = torch.jit.annotate(List[Tensor], [])  # [x]
        detection_features.append(x)
        for layer in self.additional_blocks:
            x = layer(x)
            detection_features.append(x)

        # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
        locs, confs = self.bbox_view(detection_features, self.loc, self.conf)

        # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results
        # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732

        if self.training:
            if targets is None:
                raise ValueError("In training mode, targets should be passed")
            # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            bboxes_out = targets['boxes']
            bboxes_out = bboxes_out.transpose(1, 2).contiguous()
            # print(bboxes_out.is_contiguous())
            labels_out = targets['labels']
            # print(labels_out.is_contiguous())

            # ploc, plabel, gloc, glabel
            loss = self.compute_loss(locs, confs, bboxes_out, labels_out)
            return {"total_losses": loss}

        # apply the predicted regression offsets to the default boxes to get the final
        # boxes, then run non-maximum suppression to drop overlapping ones
        # results = self.encoder.decode_batch(locs, confs)
        results = self.postprocess(locs, confs)
        return results


class Loss(nn.Module):
    """
    Implements the loss as the sum of the followings:
    1. Confidence Loss: All labels, with hard negative mining
    2. Localization Loss: Only on positive labels
    Suppose input dboxes has the shape 8732x4
    """
    def __init__(self, dboxes):
        super(Loss, self).__init__()
        # The two scale factors come from the following link
        # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
        self.scale_xy = 1.0 / dboxes.scale_xy  # 10
        self.scale_wh = 1.0 / dboxes.scale_wh  # 5

        self.location_loss = nn.SmoothL1Loss(reduction='none')
        # [num_anchors, 4] -> [4, num_anchors] -> [1, 4, num_anchors]
        self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0),
                                   requires_grad=False)

        self.confidence_loss = nn.CrossEntropyLoss(reduction='none')

    def _location_vec(self, loc):
        # type: (Tensor) -> Tensor
        """
        Generate Location Vectors
        Computes the ground-truth regression parameters relative to the anchors.
        :param loc: GT boxes matched to each anchor, Nx4x8732
        :return:
        """
        gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :]  # Nx2x8732
        gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()  # Nx2x8732
        return torch.cat((gxy, gwh), dim=1).contiguous()

    def forward(self, ploc, plabel, gloc, glabel):
        # type: (Tensor, Tensor, Tensor, Tensor) -> Tensor
        """
        ploc, plabel: Nx4x8732, Nxlabel_numx8732
            predicted location and labels

        gloc, glabel: Nx4x8732, Nx8732
            ground truth location and labels
        """
        # mask of positive samples, Tensor: [N, 8732]
        mask = torch.gt(glabel, 0)  # (gt: >)
        # mask1 = torch.nonzero(glabel)
        # number of positive samples per image in the batch, Tensor: [N]
        pos_num = mask.sum(dim=1)

        # GT location regression targets, Tensor: [N, 4, 8732]
        vec_gd = self._location_vec(gloc)

        # sum on four coordinates, and mask
        # localization loss (positive samples only)
        loc_loss = self.location_loss(ploc, vec_gd).sum(dim=1)  # Tensor: [N, 8732]
        loc_loss = (mask.float() * loc_loss).sum(dim=1)  # Tensor: [N]

        # hard negative mining, Tensor: [N, 8732]
        con = self.confidence_loss(plabel, glabel)

        # positives are never selected
        # pick the negative samples
        con_neg = con.clone()
        con_neg[mask] = 0.0
        # sort by confidence loss in descending order, con_idx (Tensor: [N, 8732])
        _, con_idx = con_neg.sort(dim=1, descending=True)
        _, con_rank = con_idx.sort(dim=1)  # neat trick: this yields each element's rank

        # the number of negatives is three times the positives
        # (hard negative mining, per the original paper),
        # capped at the total sample count of 8732
        neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1)
        neg_mask = torch.lt(con_rank, neg_num)  # (lt: <) Tensor [N, 8732]

        # the final confidence loss uses the selected positive loss plus the selected negative loss
        con_loss = (con * (mask.float() + neg_mask.float())).sum(dim=1)  # Tensor [N]

        # avoid no object detected
        # guard against images that contain no GT boxes
        total_loss = loc_loss + con_loss
        # eg. [15, 3, 5, 0] -> [1.0, 1.0, 1.0, 0.0]
        num_mask = torch.gt(pos_num, 0).float()  # flag whether each image in the batch has any positives
        pos_num = pos_num.float().clamp(min=1e-6)  # avoid division by zero
        ret = (total_loss * num_mask / pos_num).mean(dim=0)  # average the loss only over images with positives
        return ret
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from math import sqrt
import itertools

import torch
import torch.nn.functional as F
from torch.jit.annotations import Tuple, List
from torch import nn, Tensor
import numpy as np


# This function is from https://github.com/kuangliu/pytorch-ssd.
# def calc_iou_tensor(box1, box2):
#     """ Calculation of IoU based on two boxes tensor,
#     Reference to https://github.com/kuangliu/pytorch-src
#     input:
#         box1 (N, 4) format [xmin, ymin, xmax, ymax]
#         box2 (M, 4) format [xmin, ymin, xmax, ymax]
#     output:
#         IoU (N, M)
#     """
#     N = box1.size(0)
#     M = box2.size(0)
#
#     # (N, 4) -> (N, 1, 4) -> (N, M, 4)
#     be1 = box1.unsqueeze(1).expand(-1, M, -1)  # -1 means not changing the size of that dimension
#     # (M, 4) -> (1, M, 4) -> (N, M, 4)
#     be2 = box2.unsqueeze(0).expand(N, -1, -1)
#
#     # Left Top and Right Bottom
#     lt = torch.max(be1[:, :, :2], be2[:, :, :2])
#     rb = torch.min(be1[:, :, 2:], be2[:, :, 2:])
#
#     # compute intersection area
#     delta = rb - lt  # width and height
#     delta[delta < 0] = 0
#     # width * height
#     intersect = delta[:, :, 0] * delta[:, :, 1]
#
#     # compute bel1 area
#     delta1 = be1[:, :, 2:] - be1[:, :, :2]
#     area1 = delta1[:, :, 0] * delta1[:, :, 1]
#     # compute bel2 area
#     delta2 = be2[:, :, 2:] - be2[:, :, :2]
#     area2 = delta2[:, :, 0] * delta2[:, :, 1]
#
#     iou = intersect / (area1 + area2 - intersect)
#     return iou


def box_area(boxes):
    """
    Computes the area of a set of bounding boxes, which are specified by their
    (x1, y1, x2, y2) coordinates.

    Arguments:
        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
            are expected to be in (x1, y1, x2, y2) format

    Returns:
        area (Tensor[N]): area for each box
    """
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def calc_iou_tensor(boxes1, boxes2):
    """
    Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.

    Arguments:
        boxes1 (Tensor[N, 4])
        boxes2 (Tensor[M, 4])

    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # When the shapes do not match,
    # the shape of the returned output tensor follows the broadcasting rules
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # left-top [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # right-bottom [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    iou = inter / (area1[:, None] + area2 - inter)
    return iou

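A quick sanity check of `calc_iou_tensor` (a hedged sketch; the box values are made up): two unit squares offset by half their width overlap in a 0.5 x 1 strip, so the IoU is 0.5 / (1 + 1 - 0.5) = 1/3.

import torch

# hypothetical toy boxes, (xmin, ymin, xmax, ymax)
boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
boxes2 = torch.tensor([[0.5, 0.0, 1.5, 1.0],
                       [2.0, 2.0, 3.0, 3.0]])

iou = calc_iou_tensor(boxes1, boxes2)  # shape [1, 2]
print(iou)  # expected roughly [[0.3333, 0.0]]
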
# This function is from https://github.com/kuangliu/pytorch-ssd.
class Encoder(object):
    """
    Inspired by https://github.com/kuangliu/pytorch-src
    Transform between (bboxes, labels) <-> SSD output

    dboxes: default boxes in size 8732 x 4,
        encoder: input ltrb format, output xywh format
        decoder: input xywh format, output ltrb format

    encode:
        input  : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
        output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
        criteria : IoU threshold of bboxes

    decode:
        input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
        output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
        criteria : IoU threshold of bboxes
        max_output : maximum number of output bboxes
    """
    def __init__(self, dboxes):
        self.dboxes = dboxes(order='ltrb')
        self.dboxes_xywh = dboxes(order='xywh').unsqueeze(dim=0)
        self.nboxes = self.dboxes.size(0)  # number of default boxes
        self.scale_xy = dboxes.scale_xy
        self.scale_wh = dboxes.scale_wh

    def encode(self, bboxes_in, labels_in, criteria=0.5):
        """
        encode:
            input  : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
            output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            criteria : IoU threshold of bboxes
        """
        # [nboxes, 8732]
        ious = calc_iou_tensor(bboxes_in, self.dboxes)  # IoU between every GT box and every default box
        # [8732,]
        best_dbox_ious, best_dbox_idx = ious.max(dim=0)  # best-matching GT for each default box
        # [nboxes,]
        best_bbox_ious, best_bbox_idx = ious.max(dim=1)  # best-matching default box for each GT

        # Mark the best default box of every GT as a positive sample
        # (first rule of the paper's matching strategy).
        # set best ious 2.0
        best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)  # dim, index, value
        # Overwrite those default boxes' matched-GT indices accordingly.
        idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
        best_dbox_idx[best_bbox_idx[idx]] = idx

        # filter IoU > 0.5
        # Keep default boxes whose IoU with a GT exceeds the threshold (second matching
        # rule; the first rule's matches are already included via the 2.0 fill above).
        masks = best_dbox_ious > criteria
        # [8732,]
        labels_out = torch.zeros(self.nboxes, dtype=torch.int64)
        labels_out[masks] = labels_in[best_dbox_idx[masks]]
        # Replace positive default boxes with the coordinates of their matched GT.
        bboxes_out = self.dboxes.clone()
        bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]

        # Transform format to xywh format
        x = 0.5 * (bboxes_out[:, 0] + bboxes_out[:, 2])  # x
        y = 0.5 * (bboxes_out[:, 1] + bboxes_out[:, 3])  # y
        w = bboxes_out[:, 2] - bboxes_out[:, 0]  # w
        h = bboxes_out[:, 3] - bboxes_out[:, 1]  # h
        bboxes_out[:, 0] = x
        bboxes_out[:, 1] = y
        bboxes_out[:, 2] = w
        bboxes_out[:, 3] = h
        return bboxes_out, labels_out

    def scale_back_batch(self, bboxes_in, scores_in):
        """
        Convert boxes from xywh back to ltrb and run the predicted scores through softmax.
        Do scale and transform from xywh to ltrb
        suppose input N x 4 x num_bbox | N x label_num x num_bbox

        bboxes_in: xywh regression parameters predicted by the network
        scores_in: per-class probabilities predicted for each default box
        """
        if bboxes_in.device == torch.device("cpu"):
            self.dboxes = self.dboxes.cpu()
            self.dboxes_xywh = self.dboxes_xywh.cpu()
        else:
            self.dboxes = self.dboxes.cuda()
            self.dboxes_xywh = self.dboxes_xywh.cuda()

        # Returns a view of the original tensor with its dimensions permuted.
        bboxes_in = bboxes_in.permute(0, 2, 1)
        scores_in = scores_in.permute(0, 2, 1)
        # print(bboxes_in.is_contiguous())

        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]  # predicted x, y regression parameters
        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]  # predicted w, h regression parameters

        # Apply the regression parameters to the default boxes to get the final predicted boxes.
        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]

        # transform format to ltrb
        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]

        bboxes_in[:, :, 0] = l  # xmin
        bboxes_in[:, :, 1] = t  # ymin
        bboxes_in[:, :, 2] = r  # xmax
        bboxes_in[:, :, 3] = b  # ymax

        return bboxes_in, F.softmax(scores_in, dim=-1)

    def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200):
        # Convert boxes from xywh back to ltrb (which simplifies the IoU computation
        # inside NMS) and softmax the predicted scores.
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        outputs = []
        # iterate over every image in the batch
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            outputs.append(self.decode_single_new(bbox, prob, criteria, max_output))
        return outputs

    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output=200):
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
        """
        device = bboxes_in.device
        num_classes = scores_in.shape[-1]

        # clip boxes that fall outside the image
        bboxes_in = bboxes_in.clamp(min=0, max=1)

        # [8732, 4] -> [8732, 21, 4]
        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)

        # create labels for each prediction
        labels = torch.arange(num_classes, device=device)
        labels = labels.view(1, -1).expand_as(scores_in)

        # remove predictions with the background label
        bboxes_in = bboxes_in[:, 1:, :]
        scores_in = scores_in[:, 1:]
        labels = labels[:, 1:]

        # batch everything, by making every class prediction be a separate instance
        bboxes_in = bboxes_in.reshape(-1, 4)
        scores_in = scores_in.reshape(-1)
        labels = labels.reshape(-1)

        # remove low-scoring boxes (score threshold 0.05)
        inds = torch.nonzero(scores_in > 0.05, as_tuple=False).squeeze(1)
        bboxes_in, scores_in, labels = bboxes_in[inds], scores_in[inds], labels[inds]

        # remove empty boxes
        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
        keep = (ws >= 0.1 / 300) & (hs >= 0.1 / 300)
        keep = keep.nonzero(as_tuple=False).squeeze(1)
        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]

        # non-maximum suppression
        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)

        # keep only top-k scoring predictions
        keep = keep[:num_output]
        bboxes_out = bboxes_in[keep, :]
        scores_out = scores_in[keep]
        labels_out = labels[keep]

        return bboxes_out, labels_out, scores_out

    # perform non-maximum suppression
    def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
        """
        # Reference to https://github.com/amdegroot/ssd.pytorch
        bboxes_out = []
        scores_out = []
        labels_out = []

        # non-maximum suppression
        # scores_in (Tensor 8732 x nitems): iterate over the columns, i.e. the
        # probabilities of all 8732 boxes for one class at a time
        for i, score in enumerate(scores_in.split(1, 1)):
            # skip background
            if i == 0:
                continue

            # [8732, 1] -> [8732]
            score = score.squeeze(1)

            # filter out predictions with probability below 0.05
            mask = score > 0.05
            bboxes, score = bboxes_in[mask, :], score[mask]
            if score.size(0) == 0:
                continue

            # sort scores in ascending order
            score_sorted, score_idx_sorted = score.sort(dim=0)

            # select max_output indices
            score_idx_sorted = score_idx_sorted[-max_num:]
            candidates = []

            while score_idx_sorted.numel() > 0:
                idx = score_idx_sorted[-1].item()
                # boxes of the remaining candidates, Tensor [num_candidates, 4]
                bboxes_sorted = bboxes[score_idx_sorted, :]
                # the current top-scoring box, Tensor [1, 4]
                bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
                # IoU between each candidate and the top-scoring box
                iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze()

                # we only need iou < criteria
                # drop every candidate whose IoU with the top box exceeds criteria
                # (including the top box itself)
                score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
                # keep the top box's index
                candidates.append(idx)

            # store this class's detections that survived NMS
            bboxes_out.append(bboxes[candidates, :])   # box coordinates
            scores_out.append(score[candidates])       # scores
            labels_out.extend([i] * len(candidates))   # labels

        if not bboxes_out:
            # If nothing survived, return empty tensors; note the shape of the
            # empty box tensor, which keeps the validation code from failing.
            return [torch.empty(size=(0, 4)), torch.empty(size=(0,), dtype=torch.int64), torch.empty(size=(0,))]

        bboxes_out = torch.cat(bboxes_out, dim=0).contiguous()
        scores_out = torch.cat(scores_out, dim=0).contiguous()
        labels_out = torch.as_tensor(labels_out, dtype=torch.long)

        # Sort all detections by score regardless of class and keep the top max_output.
        _, max_ids = scores_out.sort(dim=0)
        max_ids = max_ids[-max_output:]
        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]

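A minimal sketch of the matching step in `Encoder.encode`, assuming the `dboxes300_coco()` factory defined further down; the GT boxes and labels are made up and normalized to [0, 1]:

import torch

dboxes = dboxes300_coco()            # 8732 default boxes (defined below)
encoder = Encoder(dboxes)

# two hypothetical ground-truth boxes in ltrb format, normalized to [0, 1]
gt_boxes = torch.tensor([[0.10, 0.10, 0.40, 0.50],
                         [0.55, 0.30, 0.90, 0.80]])
gt_labels = torch.tensor([3, 7])     # made-up class ids

bboxes_out, labels_out = encoder.encode(gt_boxes, gt_labels, criteria=0.5)
print(bboxes_out.shape, labels_out.shape)  # torch.Size([8732, 4]) torch.Size([8732])
print((labels_out > 0).sum())              # number of default boxes matched as positives
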
class DefaultBoxes(object):
    def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, scale_xy=0.1, scale_wh=0.2):
        self.fig_size = fig_size   # input image size fed to the network (300)
        # [38, 19, 10, 5, 3, 1]
        self.feat_size = feat_size  # feature map size of every prediction layer

        self.scale_xy_ = scale_xy
        self.scale_wh_ = scale_wh

        # According to https://github.com/weiliu89/caffe
        # Calculation method slightly different from paper
        # [8, 16, 32, 64, 100, 300]
        self.steps = steps  # stride of one cell of each feature layer, measured on the original image

        # [21, 45, 99, 153, 207, 261, 315]
        self.scales = scales  # default-box scales of each feature layer

        fk = fig_size / np.array(steps)  # compute f_k for every feature layer
        # [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
        self.aspect_ratios = aspect_ratios  # default-box aspect ratios of each prediction layer

        self.default_boxes = []
        # size of feature and number of feature
        # iterate over the feature layers and build their default boxes
        for idx, sfeat in enumerate(self.feat_size):
            sk1 = scales[idx] / fig_size       # scale as a relative value in [0, 1]
            sk2 = scales[idx + 1] / fig_size   # scale as a relative value in [0, 1]
            sk3 = sqrt(sk1 * sk2)
            # first add the widths and heights of the two 1:1 default boxes
            all_sizes = [(sk1, sk1), (sk3, sk3)]

            # then add the widths and heights of the remaining aspect ratios
            for alpha in aspect_ratios[idx]:
                w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha)
                all_sizes.append((w, h))
                all_sizes.append((h, w))

            # generate all default boxes of this feature layer on the original image
            for w, h in all_sizes:
                for i, j in itertools.product(range(sfeat), repeat=2):  # i -> row (y), j -> column (x)
                    # center of each default box, normalized to [0, 1]
                    cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
                    self.default_boxes.append((cx, cy, w, h))

        # convert default_boxes to a tensor
        self.dboxes = torch.as_tensor(self.default_boxes, dtype=torch.float32)  # dtype must be set explicitly or this raises
        self.dboxes.clamp_(min=0, max=1)  # clamp the coordinates (x, y, w, h) into [0, 1]

        # For IoU calculation
        # ltrb is left-top coordinate and right-bottom coordinate
        # Convert (x, y, w, h) to (xmin, ymin, xmax, ymax), which simplifies the
        # IoU computation when matching positive and negative samples.
        self.dboxes_ltrb = self.dboxes.clone()
        self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]  # xmin
        self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]  # ymin
        self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]  # xmax
        self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]  # ymax

    @property
    def scale_xy(self):
        return self.scale_xy_

    @property
    def scale_wh(self):
        return self.scale_wh_

    def __call__(self, order='ltrb'):
        # return the default boxes in the requested format
        if order == 'ltrb':
            return self.dboxes_ltrb

        if order == 'xywh':
            return self.dboxes

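Each location on layer idx gets 2 + 2 * len(aspect_ratios[idx]) default boxes: the two 1:1 boxes plus a (w, h) and (h, w) pair per ratio. A hedged sketch of that bookkeeping with the standard SSD300 settings used below:

feat_size = [38, 19, 10, 5, 3, 1]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

per_location = [2 + 2 * len(ars) for ars in aspect_ratios]   # [4, 6, 6, 6, 4, 4]
total = sum(f * f * n for f, n in zip(feat_size, per_location))
print(per_location, total)   # [4, 6, 6, 6, 4, 4] 8732
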
def dboxes300_coco():
    figsize = 300  # input image size fed to the network
    feat_size = [38, 19, 10, 5, 3, 1]   # feature map size of every prediction layer
    steps = [8, 16, 32, 64, 100, 300]   # stride of one cell of each feature layer on the original image
    # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
    scales = [21, 45, 99, 153, 207, 261, 315]  # default-box scales of each feature layer
    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]  # default-box aspect ratios of each prediction layer
    dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
    return dboxes

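A quick check of the factory (hedged; the shapes follow from the per-layer counts above):

dboxes = dboxes300_coco()
print(dboxes(order='ltrb').shape)        # torch.Size([8732, 4])
print(dboxes(order='xywh').shape)        # torch.Size([8732, 4])
print(dboxes.scale_xy, dboxes.scale_wh)  # 0.1 0.2
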
def nms(boxes, scores, iou_threshold):
    # type: (Tensor, Tensor, float) -> Tensor
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower-scoring boxes which have an
    IoU greater than iou_threshold with another (higher-scoring)
    box.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes to perform NMS on. They
        are expected to be in (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    iou_threshold : float
        discards all overlapping
        boxes with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices
        of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)


def batched_nms(boxes, scores, idxs, iou_threshold):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes where NMS will be performed. They
        are expected to be in (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    idxs : Tensor[N]
        indices of the categories for each one of the boxes.
    iou_threshold : float
        discards all overlapping boxes
        with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices of
        the elements that have been kept by NMS, sorted
        in decreasing order of scores
    """
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)

    # strategy: in order to perform NMS independently per class,
    # we add an offset to all the boxes. The offset is dependent
    # only on the class idx, and is large enough so that boxes
    # from different classes do not overlap
    # largest coordinate value over all boxes (xmin, ymin, xmax, ymax)
    max_coordinate = boxes.max()

    # to(): Performs Tensor dtype and/or device conversion
    # generate one large offset per class;
    # .to() just matches the offsets' dtype and device to boxes
    offsets = idxs.to(boxes) * (max_coordinate + 1)
    # after adding its class offset, a box can never overlap a box of another class
    boxes_for_nms = boxes + offsets[:, None]
    keep = nms(boxes_for_nms, scores, iou_threshold)
    return keep

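A hedged sketch of the class-offset trick (requires torchvision for the underlying nms op; the boxes and scores are invented): two heavily overlapping boxes survive together when they carry different class ids.

import torch
import torchvision  # registers the torchvision.nms op used above

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [1.0, 1.0, 11.0, 11.0],
                      [1.0, 1.0, 11.0, 11.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])   # third box belongs to another class

keep = batched_nms(boxes, scores, idxs, iou_threshold=0.5)
print(keep)  # tensor([0, 2]): box 1 is suppressed by box 0, box 2 survives in its own class
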
class PostProcess(nn.Module):
    def __init__(self, dboxes):
        super(PostProcess, self).__init__()
        # [num_anchors, 4] -> [1, num_anchors, 4]
        self.dboxes_xywh = nn.Parameter(dboxes(order='xywh').unsqueeze(dim=0),
                                        requires_grad=False)
        self.scale_xy = dboxes.scale_xy  # 0.1
        self.scale_wh = dboxes.scale_wh  # 0.2

        self.criteria = 0.5
        self.max_output = 100

    def scale_back_batch(self, bboxes_in, scores_in):
        # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor]
        """
        1) apply the predicted regression parameters to get the final box coordinates
        2) convert the boxes from xywh back to ltrb
        3) run the predicted scores through softmax
        Do scale and transform from xywh to ltrb
        suppose input N x 4 x num_bbox | N x label_num x num_bbox

        bboxes_in: [N, 4, 8732], the xywh regression parameters predicted by the network
        scores_in: [N, label_num, 8732], the per-class probabilities predicted for each default box
        """

        # Returns a view of the original tensor with its dimensions permuted.
        # [batch, 4, 8732] -> [batch, 8732, 4]
        bboxes_in = bboxes_in.permute(0, 2, 1)
        # [batch, label_num, 8732] -> [batch, 8732, label_num]
        scores_in = scores_in.permute(0, 2, 1)
        # print(bboxes_in.is_contiguous())

        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]  # predicted x, y regression parameters
        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]  # predicted w, h regression parameters

        # apply the regression parameters to the default boxes to get the final predicted boxes
        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]

        # transform format to ltrb
        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]

        bboxes_in[:, :, 0] = l  # xmin
        bboxes_in[:, :, 1] = t  # ymin
        bboxes_in[:, :, 2] = r  # xmax
        bboxes_in[:, :, 3] = b  # ymax

        # scores_in: [batch, 8732, label_num]
        return bboxes_in, F.softmax(scores_in, dim=-1)

    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
        # type: (Tensor, Tensor, float, int) -> Tuple[Tensor, Tensor, Tensor]
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
        """
        device = bboxes_in.device
        num_classes = scores_in.shape[-1]

        # clip boxes that fall outside the image
        bboxes_in = bboxes_in.clamp(min=0, max=1)

        # [8732, 4] -> [8732, 21, 4]
        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)

        # create labels for each prediction
        labels = torch.arange(num_classes, device=device)
        # [num_classes] -> [8732, num_classes]
        labels = labels.view(1, -1).expand_as(scores_in)

        # remove predictions with the background label
        bboxes_in = bboxes_in[:, 1:, :]  # [8732, 21, 4] -> [8732, 20, 4]
        scores_in = scores_in[:, 1:]     # [8732, 21] -> [8732, 20]
        labels = labels[:, 1:]           # [8732, 21] -> [8732, 20]

        # batch everything, by making every class prediction be a separate instance
        bboxes_in = bboxes_in.reshape(-1, 4)  # [8732, 20, 4] -> [8732x20, 4]
        scores_in = scores_in.reshape(-1)     # [8732, 20] -> [8732x20]
        labels = labels.reshape(-1)           # [8732, 20] -> [8732x20]

        # remove low-scoring boxes (score threshold 0.05)
        # inds = torch.nonzero(scores_in > 0.05).squeeze(1)
        inds = torch.where(torch.gt(scores_in, 0.05))[0]
        bboxes_in, scores_in, labels = bboxes_in[inds, :], scores_in[inds], labels[inds]

        # remove empty boxes
        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
        keep = (ws >= 1 / 300) & (hs >= 1 / 300)
        # keep = keep.nonzero().squeeze(1)
        keep = torch.where(keep)[0]
        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]

        # non-maximum suppression
        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)

        # keep only top-k scoring predictions
        keep = keep[:num_output]
        bboxes_out = bboxes_in[keep, :]
        scores_out = scores_in[keep]
        labels_out = labels[keep]

        return bboxes_out, labels_out, scores_out

    def forward(self, bboxes_in, scores_in):
        # apply the predicted regression parameters to get the final boxes
        # and softmax the predicted scores
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        outputs = torch.jit.annotate(List[Tuple[Tensor, Tensor, Tensor]], [])
        # iterate over every image in the batch
        # bboxes: [batch, 8732, 4]
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):  # split_size, split_dim
            # bbox: [1, 8732, 4]
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            outputs.append(self.decode_single_new(bbox, prob, self.criteria, self.max_output))
        return outputs
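A hedged smoke test of PostProcess on random head outputs (21 classes, i.e. 20 foreground plus background; the tensors are random, so the resulting detections are meaningless):

import torch
import torchvision  # registers the nms op used by batched_nms

post = PostProcess(dboxes300_coco())
loc = torch.randn(2, 4, 8732)     # fake localization head output, [batch, 4, 8732]
conf = torch.randn(2, 21, 8732)   # fake confidence head output, [batch, 21, 8732]

with torch.no_grad():
    detections = post(loc, conf)

boxes, labels, scores = detections[0]
print(boxes.shape, labels.shape, scores.shape)  # up to 100 detections for the first image
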
sddfrcnn_model/draw_box_utils.py
ADDED
|
@@ -0,0 +1,197 @@
|
from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
from PIL import ImageColor
import numpy as np

STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
    'WhiteSmoke', 'Yellow', 'YellowGreen'
]


def draw_text(draw,
              box: list,
              cls: int,
              score: float,
              category_index: dict,
              color: str,
              font: str = 'arial.ttf',
              font_size: int = 24):
    """
    Draw the bounding box's class label and score onto the image.
    """
    try:
        font = ImageFont.truetype(font, font_size)
    except IOError:
        font = ImageFont.load_default()

    left, top, right, bottom = box
    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
    """
    display_str_heights = [draw.textsize(ds, font=font)[1] for ds in display_str]
    """
    bbox = draw.textbbox((0, 0), display_str, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]
    # Each display_str has a top and bottom margin of 0.05x.
    display_str_height = (1 + 2 * 0.05) * text_height

    if top > display_str_height:
        text_top = top - display_str_height
        text_bottom = top
    else:
        text_top = bottom
        text_bottom = bottom + display_str_height
    margin = np.ceil(0.05 * text_width)
    text_rect_left = left
    text_rect_right = left + text_width + 2 * margin

    # make sure the text rectangle does not leave the image
    img_width, img_height = draw.im.size
    if text_rect_right > img_width:
        text_rect_right = img_width
        text_rect_left = max(0, img_width - text_width - 2 * margin)

    if text_bottom > img_height:
        text_bottom = img_height
        text_top = max(0, img_height - display_str_height)

    # draw the text background and the text itself
    draw.rectangle([(text_rect_left, text_top),
                    (text_rect_right, text_bottom)], fill=color)
    draw.text((text_rect_left + margin, text_top),
              display_str,
              fill='black',
              font=font)
    '''
    for ds in display_str:
        """
        text_width, text_height = draw.textsize(text, font=font)
        """
        bbox = draw.textbbox((0, 0), display_str, font=font)
        text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
        margin = np.ceil(0.05 * text_width)
        draw.rectangle([(left, text_top),
                        (left + text_width + 2 * margin, text_bottom)], fill=color)
        draw.text((left + margin, text_top),
                  ds,
                  fill='black',
                  font=font)
        left += text_width
    '''


def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
    np_image = np.array(image)
    masks = np.where(masks > thresh, True, False)

    # colors = np.array(colors)
    img_to_draw = np.copy(np_image)
    # TODO: There might be a way to vectorize this
    for mask, color in zip(masks, colors):
        img_to_draw[mask] = color

    out = np_image * (1 - alpha) + img_to_draw * alpha
    return fromarray(out.astype(np.uint8))


def draw_objs(image: Image,
              boxes: np.ndarray = None,
              classes: np.ndarray = None,
              scores: np.ndarray = None,
              masks: np.ndarray = None,
              category_index: dict = None,
              box_thresh: float = 0.1,
              mask_thresh: float = 0.5,
              line_thickness: int = 8,
              font: str = 'arial.ttf',
              font_size: int = 24,
              draw_boxes_on_image: bool = True,
              draw_masks_on_image: bool = False):
    """
    Draw bounding boxes, class information and masks onto an image.
    Args:
        image: the image to draw on
        boxes: bounding-box coordinates
        classes: class indices
        scores: detection scores
        masks: segmentation masks
        category_index: dict mapping class index to class name
        box_thresh: score threshold for filtering detections
        mask_thresh: mask binarization threshold
        line_thickness: bounding-box line width
        font: font file
        font_size: font size
        draw_boxes_on_image: whether to draw the boxes
        draw_masks_on_image: whether to draw the masks

    Returns:
        the image with the drawings applied
    """

    # filter out low-probability detections
    idxs = np.greater(scores, box_thresh)
    boxes = boxes[idxs]
    classes = classes[idxs]
    scores = scores[idxs]
    if masks is not None:
        masks = masks[idxs]
    if len(boxes) == 0:
        return image

    colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]

    if draw_boxes_on_image:
        # Draw all boxes onto image.
        draw = ImageDraw.Draw(image)
        for box, cls, score, color in zip(boxes, classes, scores, colors):
            left, top, right, bottom = box
            # clip the box to the image bounds before drawing

            img_width, img_height = image.size
            left = max(0, min(left, img_width - 1))
            top = max(0, min(top, img_height - 1))
            right = max(0, min(right, img_width - 1))
            bottom = max(0, min(bottom, img_height - 1))

            # draw the bounding box as four segments (not five)
            draw.line([(left, top), (right, top)], width=line_thickness, fill=color)        # top edge
            draw.line([(right, top), (right, bottom)], width=line_thickness, fill=color)    # right edge
            draw.line([(right, bottom), (left, bottom)], width=line_thickness, fill=color)  # bottom edge
            draw.line([(left, bottom), (left, top)], width=line_thickness, fill=color)      # left edge
            '''
            draw.line([(left, top), (left, bottom), (right, bottom),
                       (right, top), (left, top)], width=line_thickness, fill=color)
            '''
            # draw class and score
            draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)

    if draw_masks_on_image and (masks is not None):
        # Draw all masks onto image.
        image = draw_masks(image, masks, colors, mask_thresh)

    return image
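A hedged usage sketch of draw_objs (the image, detections, and category_index mapping are all invented; note the string keys, since draw_text looks up category_index[str(cls)]):

import numpy as np
from PIL import Image

img = Image.new('RGB', (300, 300), color='white')   # placeholder image
boxes = np.array([[30.0, 40.0, 180.0, 220.0]])      # (left, top, right, bottom)
classes = np.array([1])
scores = np.array([0.87])
category_index = {'1': 'target'}                    # hypothetical class name

out = draw_objs(img, boxes, classes, scores,
                category_index=category_index,
                box_thresh=0.5, line_thickness=4)
out.save('vis.jpg')
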
sddfrcnn_model/network_files/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
from .retinanet import RetinaNet
sddfrcnn_model/network_files/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (212 Bytes)

sddfrcnn_model/network_files/__pycache__/anchor_utils.cpython-310.pyc
ADDED
Binary file (5.19 kB)

sddfrcnn_model/network_files/__pycache__/boxes.cpython-310.pyc
ADDED
Binary file (5.12 kB)

sddfrcnn_model/network_files/__pycache__/det_utils.cpython-310.pyc
ADDED
Binary file (10.6 kB)

sddfrcnn_model/network_files/__pycache__/image_list.cpython-310.pyc
ADDED
Binary file (1.15 kB)

sddfrcnn_model/network_files/__pycache__/losses.cpython-310.pyc
ADDED
Binary file (1.91 kB)

sddfrcnn_model/network_files/__pycache__/retinanet.cpython-310.pyc
ADDED
Binary file (16.2 kB)

sddfrcnn_model/network_files/__pycache__/transform.cpython-310.pyc
ADDED
Binary file (8.89 kB)
sddfrcnn_model/network_files/anchor_utils.py
ADDED
|
@@ -0,0 +1,192 @@
|
from typing import List, Optional, Dict

import torch
from torch import nn, Tensor

from .image_list import ImageList


class AnchorsGenerator(nn.Module):
    __annotations__ = {
        "cell_anchors": Optional[List[torch.Tensor]],
        "_cache": Dict[str, List[torch.Tensor]]
    }

    """
    Anchor generator.
    Module that generates anchors for a set of feature maps and
    image sizes.

    The module supports computing anchors at multiple sizes and aspect ratios
    per feature map.

    sizes and aspect_ratios should have the same number of elements, and it should
    correspond to the number of feature maps.

    sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
    and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
    per spatial location for feature map i.

    Arguments:
        sizes (Tuple[Tuple[int]]):
        aspect_ratios (Tuple[Tuple[float]]):
    """

    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
        super(AnchorsGenerator, self).__init__()

        if not isinstance(sizes[0], (list, tuple)):
            # TODO change this
            sizes = tuple((s,) for s in sizes)
        if not isinstance(aspect_ratios[0], (list, tuple)):
            aspect_ratios = (aspect_ratios,) * len(sizes)

        assert len(sizes) == len(aspect_ratios)

        self.sizes = sizes
        self.aspect_ratios = aspect_ratios
        self.cell_anchors = None
        self._cache = {}

    def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
        # type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
        """
        compute anchor sizes
        Arguments:
            scales: sqrt(anchor_area)
            aspect_ratios: h/w ratios
            dtype: float32
            device: cpu/gpu
        """
        scales = torch.as_tensor(scales, dtype=dtype, device=device)
        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
        h_ratios = torch.sqrt(aspect_ratios)
        w_ratios = 1.0 / h_ratios

        # [r1, r2, r3]' * [s1, s2, s3]
        # number of elements is len(ratios)*len(scales)
        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
        hs = (h_ratios[:, None] * scales[None, :]).view(-1)

        # left-top, right-bottom coordinates relative to the anchor center (0, 0)
        # the generated anchor templates are all centered at (0, 0),
        # shape [len(ratios)*len(scales), 4]
        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2

        return base_anchors.round()  # round to the nearest integer

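A hedged check of generate_anchors on its own (the scale and ratios are made up; this assumes the module imports cleanly, since the relative image_list import runs at import time):

import torch

gen = AnchorsGenerator(sizes=((128,),), aspect_ratios=((0.5, 1.0, 2.0),))
templates = gen.generate_anchors([128], [0.5, 1.0, 2.0])
print(templates.shape)  # torch.Size([3, 4]), one zero-centered template per ratio
print(templates[1])     # 1:1 ratio -> tensor([-64., -64., 64., 64.])
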
    def set_cell_anchors(self, dtype, device):
        # type: (torch.dtype, torch.device) -> None
        if self.cell_anchors is not None:
            cell_anchors = self.cell_anchors
            assert cell_anchors is not None
            # suppose that all anchors have the same device
            # which is a valid assumption in the current state of the codebase
            if cell_anchors[0].device == device:
                return

        # generate the anchor templates from the given sizes and aspect_ratios;
        # all templates are centered at (0, 0)
        cell_anchors = [
            self.generate_anchors(sizes, aspect_ratios, dtype, device)
            for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)
        ]
        self.cell_anchors = cell_anchors

    def num_anchors_per_location(self):
        # number of anchors predicted at each sliding-window position of every prediction feature layer
        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]

    # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
    # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
    def grid_anchors(self, grid_sizes, strides):
        # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
        """
        anchors position in grid coordinate axis map into origin image
        Compute the coordinates of all anchors on the original image for each prediction feature map.
        Args:
            grid_sizes: height and width of the prediction feature maps
            strides: stride of one step on a feature map, measured on the original image
        """
        anchors = []
        cell_anchors = self.cell_anchors
        assert cell_anchors is not None

        # iterate over the grid_size, stride and cell_anchors of every prediction feature layer
        for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
            grid_height, grid_width = size
            stride_height, stride_width = stride
            device = base_anchors.device

            # For output anchor, compute [x_center, y_center, x_center, y_center]
            # shape: [grid_width], the x coordinates (columns) on the original image
            shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
            # shape: [grid_height], the y coordinates (rows) on the original image
            shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height

            # Compute, for every point of the feature map, the corresponding coordinates
            # on the original image (the offsets to apply to the anchor templates).
            # torch.meshgrid takes the row and column coordinates and produces the
            # grid of row coordinates and the grid of column coordinates.
            # shape: [grid_height, grid_width]
            # (newer PyTorch versions may require passing indexing="ij" here)
            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
            shift_x = shift_x.reshape(-1)
            shift_y = shift_y.reshape(-1)

            # offsets of the anchor coordinates (xmin, ymin, xmax, ymax) on the original image
            # shape: [grid_width*grid_height, 4]
            shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)

            # For every (base anchor, output anchor) pair,
            # offset each zero-centered base anchor by the center of the output anchor.
            # Adding the offsets to the templates yields the coordinates of all anchors
            # on the original image (broadcasting handles the shape mismatch).
            shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
            anchors.append(shifts_anchor.reshape(-1, 4))

        return anchors  # List[Tensor(all_num_anchors, 4)]

    def cached_grid_anchors(self, grid_sizes, strides):
        # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
        """Cache all computed anchors."""
        key = str(grid_sizes) + str(strides)
        # self._cache is a dict
        if key in self._cache:
            return self._cache[key]
        anchors = self.grid_anchors(grid_sizes, strides)
        self._cache[key] = anchors
        return anchors

    def forward(self, image_list, feature_maps):
        # type: (ImageList, List[Tensor]) -> List[Tensor]
        # height and width of every prediction feature layer
        grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])

        # height and width of the input images
        image_size = image_list.tensors.shape[-2:]

        # dtype and device of the feature maps
        dtype, device = feature_maps[0].dtype, feature_maps[0].device

        # one step in feature map equates to an n-pixel stride on the original image
        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]

        # generate the anchor templates from the given sizes and aspect_ratios
        self.set_cell_anchors(dtype, device)

        # Compute (or read from the cache) the coordinates of all anchors; these are the
        # anchors mapped back onto the original image, not the templates.
        # The result is a list with one entry of anchor coordinates per prediction feature map.
        anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)

        anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
        # iterate over every image in the batch
        for i, (image_height, image_width) in enumerate(image_list.image_sizes):
            anchors_in_image = []
            # iterate over the per-feature-map anchors mapped back onto the original image
            for anchors_per_feature_map in anchors_over_all_feature_maps:
                anchors_in_image.append(anchors_per_feature_map)
            anchors.append(anchors_in_image)
        # Concatenate the anchors of all prediction feature layers for each image;
        # anchors is a list with one tensor holding all anchors per image.
        anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
        # Clear the cache in case that memory leaks.
        self._cache.clear()
        return anchors
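A hedged end-to-end sketch of the generator (the FakeImageList stand-in is hypothetical and only mimics the two attributes forward actually reads; real code would pass the module's ImageList):

import torch
from collections import namedtuple

# stand-in exposing only the attributes forward() reads
FakeImageList = namedtuple("FakeImageList", ["tensors", "image_sizes"])

gen = AnchorsGenerator(sizes=((32,), (64,)), aspect_ratios=((0.5, 1.0, 2.0),) * 2)
feature_maps = [torch.randn(1, 256, 50, 50), torch.randn(1, 256, 25, 25)]
images = FakeImageList(tensors=torch.randn(1, 3, 400, 400), image_sizes=[(400, 400)])

anchors = gen(images, feature_maps)
print(len(anchors), anchors[0].shape)  # 1 image, (50*50 + 25*25) * 3 anchors = torch.Size([9375, 4])
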