HZSDU committed on
Commit dfb6163 · verified · 1 Parent(s): 475273d

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +12 -0
  2. assets/1.jpg +3 -0
  3. assets/2.jpg +3 -0
  4. assets/3.jpg +3 -0
  5. assets/4.jpg +3 -0
  6. checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738985521.Iflight +3 -0
  7. checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995505.Iflight +3 -0
  8. checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995594.Iflight +3 -0
  9. checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995658.Iflight +3 -0
  10. checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742698.autodl-container-10a44fbcf4-a7468946 +3 -0
  11. checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742996.autodl-container-10a44fbcf4-a7468946 +3 -0
  12. checkpoints/imagenet/hole_benchmark/gen_00430000.pt +3 -0
  13. checkpoints/ostracoda_cyclegan/latest_net_D_A.pth +3 -0
  14. checkpoints/ostracoda_cyclegan/latest_net_D_B.pth +3 -0
  15. data/style/11.png +3 -0
  16. data/style/32.jpg +3 -0
  17. data/style/6.jpg +3 -0
  18. data/style/7.jpg +3 -0
  19. data/texture/16.jpg +3 -0
  20. data/texture/17.jpg +3 -0
  21. data/texture/4.jpg +3 -0
  22. data/texture/8.jpg +3 -0
  23. model/tokenizer/tokenizer_config.json +34 -0
  24. model/tokenizer/vocab.json +0 -0
  25. model/unet/config.json +36 -0
  26. model/vae/config.json +29 -0
  27. sddfrcnn_model/__pycache__/draw_box_utils.cpython-310.pyc +0 -0
  28. sddfrcnn_model/backbone/__init__.py +3 -0
  29. sddfrcnn_model/backbone/__pycache__/__init__.cpython-310.pyc +0 -0
  30. sddfrcnn_model/backbone/__pycache__/feature_pyramid_network.cpython-310.pyc +0 -0
  31. sddfrcnn_model/backbone/__pycache__/res50_backbone.cpython-310.pyc +0 -0
  32. sddfrcnn_model/backbone/__pycache__/resnet50_fpn_model.cpython-310.pyc +0 -0
  33. sddfrcnn_model/backbone/__pycache__/ssd_model.cpython-310.pyc +0 -0
  34. sddfrcnn_model/backbone/__pycache__/utils.cpython-310.pyc +0 -0
  35. sddfrcnn_model/backbone/feature_pyramid_network.py +283 -0
  36. sddfrcnn_model/backbone/res50_backbone.py +106 -0
  37. sddfrcnn_model/backbone/resnet50_fpn_model.py +199 -0
  38. sddfrcnn_model/backbone/ssd_model.py +225 -0
  39. sddfrcnn_model/backbone/utils.py +628 -0
  40. sddfrcnn_model/draw_box_utils.py +197 -0
  41. sddfrcnn_model/network_files/__init__.py +1 -0
  42. sddfrcnn_model/network_files/__pycache__/__init__.cpython-310.pyc +0 -0
  43. sddfrcnn_model/network_files/__pycache__/anchor_utils.cpython-310.pyc +0 -0
  44. sddfrcnn_model/network_files/__pycache__/boxes.cpython-310.pyc +0 -0
  45. sddfrcnn_model/network_files/__pycache__/det_utils.cpython-310.pyc +0 -0
  46. sddfrcnn_model/network_files/__pycache__/image_list.cpython-310.pyc +0 -0
  47. sddfrcnn_model/network_files/__pycache__/losses.cpython-310.pyc +0 -0
  48. sddfrcnn_model/network_files/__pycache__/retinanet.cpython-310.pyc +0 -0
  49. sddfrcnn_model/network_files/__pycache__/transform.cpython-310.pyc +0 -0
  50. sddfrcnn_model/network_files/anchor_utils.py +192 -0
.gitattributes CHANGED
@@ -65,3 +65,15 @@ data/style/5.jpg filter=lfs diff=lfs merge=lfs -text
  data/style/59.png filter=lfs diff=lfs merge=lfs -text
  data/texture/14.jpg filter=lfs diff=lfs merge=lfs -text
  data/texture/15.jpg filter=lfs diff=lfs merge=lfs -text
+ data/texture/16.jpg filter=lfs diff=lfs merge=lfs -text
+ data/texture/4.jpg filter=lfs diff=lfs merge=lfs -text
+ data/style/7.jpg filter=lfs diff=lfs merge=lfs -text
+ data/style/11.png filter=lfs diff=lfs merge=lfs -text
+ data/texture/8.jpg filter=lfs diff=lfs merge=lfs -text
+ data/style/6.jpg filter=lfs diff=lfs merge=lfs -text
+ data/style/32.jpg filter=lfs diff=lfs merge=lfs -text
+ data/texture/17.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/2.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/3.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/4.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/1.jpg filter=lfs diff=lfs merge=lfs -text
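
Aside (not part of the commit): entries like the ones above are exactly what the `git lfs track` command appends to .gitattributes, e.g. `git lfs track "data/texture/*.jpg"` followed by committing both .gitattributes and the tracked files.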
assets/1.jpg ADDED

Git LFS Details

  • SHA256: f112aa97b1e8ea91953685b7d7e815f611dbebcadac6aaf6cb8efc7539fb4cea
  • Pointer size: 132 Bytes
  • Size of remote file: 4.3 MB
assets/2.jpg ADDED

Git LFS Details

  • SHA256: 439bf0f3f75d45188122f7351163ec939d13d2a21ac86ab8c7964e6b0ac5d098
  • Pointer size: 132 Bytes
  • Size of remote file: 2.41 MB
assets/3.jpg ADDED

Git LFS Details

  • SHA256: 155507d2603f2afd797afb6f07a7cd533767620e55875893735d60718fd6688c
  • Pointer size: 132 Bytes
  • Size of remote file: 3.93 MB
assets/4.jpg ADDED

Git LFS Details

  • SHA256: 898d8c33ab6f3ae1be79fead16ac261b3950c065bd2f308035d887b5666ca179
  • Pointer size: 132 Bytes
  • Size of remote file: 2.54 MB
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738985521.Iflight ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:970514f929b756e7026179fd443b21eff57e61901c16d3bbd3af81afe0de53dd
+ size 40
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995505.Iflight ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db215c86a3dfe491ef28a21766d485c84edf70099f98464be9fa1cddd3f4e633
+ size 40
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995594.Iflight ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bde18faa073f516a72bbdd40bb5e08e5fb8da56914455612b49d9c8d85b3cca8
+ size 40
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995658.Iflight ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e7127eee0799d2360a417549a82c10cc3b12ec09f9015495257ec92e55383894
+ size 40
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742698.autodl-container-10a44fbcf4-a7468946 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f75095a66582caded1c54a9fa7cc3e7edc13d3e9e17a559929cace1f64f6f7e2
+ size 40
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742996.autodl-container-10a44fbcf4-a7468946 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79c1f85ed120d2ac696dfa584e7368b9a5eae288f5e6d938ff68788146279d5c
+ size 152845
checkpoints/imagenet/hole_benchmark/gen_00430000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee688f6cf0649a0eeea9c4623719eeab52bf39f2a5f2dabf80cbcf1995f289b3
+ size 14443538
checkpoints/ostracoda_cyclegan/latest_net_D_A.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0a52376b4c7fdb72089e48a3aa1e9c6f3f26576dba68c01f058643baf4506944
+ size 11063002
checkpoints/ostracoda_cyclegan/latest_net_D_B.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7894e2f5f98edad44b0eb0202ce5b6f64669a0436099caaf36719ea2a8e963eb
+ size 11063002
data/style/11.png ADDED

Git LFS Details

  • SHA256: 9b79ca19fe8ce49287186cb749f881fa3702313b1bb672fbda9208e22c5f4733
  • Pointer size: 132 Bytes
  • Size of remote file: 1.49 MB
data/style/32.jpg ADDED

Git LFS Details

  • SHA256: 80c208e2563530e15344b6b5be65839e0d9451baa306ee80efd7304f61eeae58
  • Pointer size: 131 Bytes
  • Size of remote file: 853 kB
data/style/6.jpg ADDED

Git LFS Details

  • SHA256: fc10e4005cf93c94ed5e02b536efc21f79725c99efabd52bddd05ac94e7821ca
  • Pointer size: 131 Bytes
  • Size of remote file: 189 kB
data/style/7.jpg ADDED

Git LFS Details

  • SHA256: cb963419f2601053fc0198bff30b331cb04fcb82ed73787d6b8132f2d44628fc
  • Pointer size: 131 Bytes
  • Size of remote file: 399 kB
data/texture/16.jpg ADDED

Git LFS Details

  • SHA256: b449c7cbc71da8e6c90250b89bd4f8147198c950984e043662b67fe6a201c35d
  • Pointer size: 131 Bytes
  • Size of remote file: 194 kB
data/texture/17.jpg ADDED

Git LFS Details

  • SHA256: 8333185aeb3ad656c461c6ac6926187fb9b0adef69b267a512e3add788940ec5
  • Pointer size: 131 Bytes
  • Size of remote file: 391 kB
data/texture/4.jpg ADDED

Git LFS Details

  • SHA256: 638b93cb3a43b5ff4f2c92097928a1822afc4cf72a985b5fede6dc245cc82615
  • Pointer size: 131 Bytes
  • Size of remote file: 312 kB
data/texture/8.jpg ADDED

Git LFS Details

  • SHA256: c145ee768560ed8848967c30caaab4dbb75e73bc6e10d8622502ef7966ea04e7
  • Pointer size: 131 Bytes
  • Size of remote file: 111 kB
model/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
+ {
+   "add_prefix_space": false,
+   "bos_token": {
+     "__type": "AddedToken",
+     "content": "<|startoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "do_lower_case": true,
+   "eos_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "errors": "replace",
+   "model_max_length": 77,
+   "name_or_path": "openai/clip-vit-large-patch14",
+   "pad_token": "<|endoftext|>",
+   "special_tokens_map_file": "./special_tokens_map.json",
+   "tokenizer_class": "CLIPTokenizer",
+   "unk_token": {
+     "__type": "AddedToken",
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
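
Aside (not part of the diff): this config targets `CLIPTokenizer` from the `transformers` library, so the folder loads directly. A minimal sketch, assuming `transformers` is installed and the sibling vocab.json/merges.txt files are present:

    from transformers import CLIPTokenizer

    # reads tokenizer_config.json plus vocab.json / merges.txt from the folder
    tokenizer = CLIPTokenizer.from_pretrained("model/tokenizer")
    ids = tokenizer("an example prompt").input_ids  # model_max_length=77 caps usable prompt length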
model/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
model/unet/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "_class_name": "UNet2DConditionModel",
+   "_diffusers_version": "0.6.0",
+   "act_fn": "silu",
+   "attention_head_dim": 8,
+   "block_out_channels": [
+     320,
+     640,
+     1280,
+     1280
+   ],
+   "center_input_sample": false,
+   "cross_attention_dim": 768,
+   "down_block_types": [
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "CrossAttnDownBlock2D",
+     "DownBlock2D"
+   ],
+   "downsample_padding": 1,
+   "flip_sin_to_cos": true,
+   "freq_shift": 0,
+   "in_channels": 4,
+   "layers_per_block": 2,
+   "mid_block_scale_factor": 1,
+   "norm_eps": 1e-05,
+   "norm_num_groups": 32,
+   "out_channels": 4,
+   "sample_size": 64,
+   "up_block_types": [
+     "UpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D",
+     "CrossAttnUpBlock2D"
+   ]
+ }
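
Aside (not part of the diff): with a recent `diffusers`, the config alone is enough to instantiate the architecture with random weights, which is handy for shape checks; `load_config`/`from_config` are the standard ConfigMixin entry points. A minimal sketch:

    from diffusers import UNet2DConditionModel

    unet = UNet2DConditionModel.from_config(
        UNet2DConditionModel.load_config("model/unet"))
    # expects 64x64 4-channel latents plus 768-dim text states (cross_attention_dim)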
model/vae/config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "_class_name": "AutoencoderKL",
+   "_diffusers_version": "0.6.0",
+   "act_fn": "silu",
+   "block_out_channels": [
+     128,
+     256,
+     512,
+     512
+   ],
+   "down_block_types": [
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D",
+     "DownEncoderBlock2D"
+   ],
+   "in_channels": 3,
+   "latent_channels": 4,
+   "layers_per_block": 2,
+   "norm_num_groups": 32,
+   "out_channels": 3,
+   "sample_size": 512,
+   "up_block_types": [
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D",
+     "UpDecoderBlock2D"
+   ]
+ }
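
Aside (not part of the diff): this VAE config pairs with the UNet above; its four encoder blocks give an 8x spatial reduction, so sample_size 512 RGB maps to the 64x64, 4-channel latents the UNet expects. A minimal sketch under the same `diffusers` assumptions:

    from diffusers import AutoencoderKL

    vae = AutoencoderKL.from_config(AutoencoderKL.load_config("model/vae"))
    # encode: [B, 3, 512, 512] -> latents [B, 4, 64, 64]; decode reverses it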
sddfrcnn_model/__pycache__/draw_box_utils.cpython-310.pyc ADDED
Binary file (5.2 kB).
 
sddfrcnn_model/backbone/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .feature_pyramid_network import FeaturePyramidNetwork, LastLevelP6P7, LastLevelMaxPool
+ from .resnet50_fpn_model import resnet50_fpn_backbone
+ from .ssd_model import SSD300, Backbone
sddfrcnn_model/backbone/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (407 Bytes).
 
sddfrcnn_model/backbone/__pycache__/feature_pyramid_network.cpython-310.pyc ADDED
Binary file (10.1 kB).
 
sddfrcnn_model/backbone/__pycache__/res50_backbone.cpython-310.pyc ADDED
Binary file (3.27 kB).
 
sddfrcnn_model/backbone/__pycache__/resnet50_fpn_model.cpython-310.pyc ADDED
Binary file (6.26 kB).
 
sddfrcnn_model/backbone/__pycache__/ssd_model.cpython-310.pyc ADDED
Binary file (6.61 kB).
 
sddfrcnn_model/backbone/__pycache__/utils.cpython-310.pyc ADDED
Binary file (14.9 kB).
 
sddfrcnn_model/backbone/feature_pyramid_network.py ADDED
@@ -0,0 +1,283 @@
+ from collections import OrderedDict
+
+ import torch.nn as nn
+ import torch
+ from torch import Tensor
+ import torch.nn.functional as F
+
+ from torch.jit.annotations import Tuple, List, Dict
+
+
+ class IntermediateLayerGetter(nn.ModuleDict):
+     """
+     Module wrapper that returns intermediate layers from a model
+     It has a strong assumption that the modules have been registered
+     into the model in the same order as they are used.
+     This means that one should **not** reuse the same nn.Module
+     twice in the forward if you want this to work.
+     Additionally, it is only able to query submodules that are directly
+     assigned to the model. So if `model` is passed, `model.feature1` can
+     be returned, but not `model.feature1.layer2`.
+     Arguments:
+         model (nn.Module): model on which we will extract the features
+         return_layers (Dict[name, new_name]): a dict containing the names
+             of the modules for which the activations will be returned as
+             the key of the dict, and the value of the dict is the name
+             of the returned activation (which the user can specify).
+     """
+     __annotations__ = {
+         "return_layers": Dict[str, str],
+     }
+
+     def __init__(self, model, return_layers):
+         if not set(return_layers).issubset([name for name, _ in model.named_children()]):
+             raise ValueError("return_layers are not present in model")
+
+         orig_return_layers = return_layers
+         return_layers = {str(k): str(v) for k, v in return_layers.items()}
+         layers = OrderedDict()
+
+         # iterate over the model's children in registration order,
+         # keeping only layer4 and everything before it; the unused tail is dropped
+         for name, module in model.named_children():
+             layers[name] = module
+             if name in return_layers:
+                 del return_layers[name]
+             if not return_layers:
+                 break
+
+         super().__init__(layers)
+         self.return_layers = orig_return_layers
+
+     def forward(self, x):
+         out = OrderedDict()
+         # run the forward pass through every stored submodule in order,
+         # collecting the outputs of layer1, layer2, layer3 and layer4
+         for name, module in self.items():
+             x = module(x)
+             if name in self.return_layers:
+                 out_name = self.return_layers[name]
+                 out[out_name] = x
+         return out
+
+
+ class BackboneWithFPN(nn.Module):
+     """
+     Adds an FPN on top of a model.
+     Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+     extract a submodel that returns the feature maps specified in return_layers.
+     The same limitations of IntermediateLayerGetter apply here.
+     Arguments:
+         backbone (nn.Module)
+         return_layers (Dict[name, new_name]): a dict containing the names
+             of the modules for which the activations will be returned as
+             the key of the dict, and the value of the dict is the name
+             of the returned activation (which the user can specify).
+         in_channels_list (List[int]): number of channels for each feature map
+             that is returned, in the order they are present in the OrderedDict
+         out_channels (int): number of channels in the FPN.
+         extra_blocks: ExtraFPNBlock
+     Attributes:
+         out_channels (int): the number of channels in the FPN
+     """
+
+     def __init__(self,
+                  backbone: nn.Module,
+                  return_layers=None,
+                  in_channels_list=None,
+                  out_channels=256,
+                  extra_blocks=None,
+                  re_getter=True):
+         super().__init__()
+
+         if extra_blocks is None:
+             extra_blocks = LastLevelMaxPool()
+
+         if re_getter:
+             assert return_layers is not None
+             self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+         else:
+             self.body = backbone
+
+         self.fpn = FeaturePyramidNetwork(
+             in_channels_list=in_channels_list,
+             out_channels=out_channels,
+             extra_blocks=extra_blocks,
+         )
+
+         self.out_channels = out_channels
+
+     def forward(self, x):
+         x = self.body(x)
+         x = self.fpn(x)
+         return x
+
+
+ class ExtraFPNBlock(nn.Module):
+     """
+     Base class for the extra block in the FPN.
+
+     Args:
+         results (List[Tensor]): the result of the FPN
+         x (List[Tensor]): the original feature maps
+         names (List[str]): the names for each one of the
+             original feature maps
+
+     Returns:
+         results (List[Tensor]): the extended set of results
+             of the FPN
+         names (List[str]): the extended set of names for the results
+     """
+     def forward(self,
+                 results: List[Tensor],
+                 x: List[Tensor],
+                 names: List[str]) -> Tuple[List[Tensor], List[str]]:
+         pass
+
+
+ class LastLevelMaxPool(torch.nn.Module):
+     """
+     Applies a max_pool2d on top of the last feature map
+     """
+
+     def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
+         names.append("pool")
+         x.append(F.max_pool2d(x[-1], 1, 2, 0))
+         return x, names
+
+
+ class LastLevelP6P7(ExtraFPNBlock):
+     """
+     This module is used in RetinaNet to generate extra layers, P6 and P7.
+     """
+     def __init__(self, in_channels: int, out_channels: int):
+         super().__init__()
+         self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
+         self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
+         for module in [self.p6, self.p7]:
+             nn.init.kaiming_uniform_(module.weight, a=1)
+             nn.init.constant_(module.bias, 0)
+         self.use_P5 = in_channels == out_channels
+
+     def forward(self,
+                 p: List[Tensor],
+                 c: List[Tensor],
+                 names: List[str]) -> Tuple[List[Tensor], List[str]]:
+         p5, c5 = p[-1], c[-1]
+         x = p5 if self.use_P5 else c5
+         p6 = self.p6(x)
+         p7 = self.p7(F.relu(p6))
+         p.extend([p6, p7])
+         names.extend(["p6", "p7"])
+         return p, names
+
+
+ class FeaturePyramidNetwork(nn.Module):
+     """
+     Module that adds an FPN on top of a set of feature maps. This is based on
+     `"Feature Pyramid Networks for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
+     The feature maps are currently supposed to be in increasing depth
+     order.
+     The input to the model is expected to be an OrderedDict[Tensor], containing
+     the feature maps on top of which the FPN will be added.
+     Arguments:
+         in_channels_list (list[int]): number of channels for each feature map that
+             is passed to the module
+         out_channels (int): number of channels of the FPN representation
+         extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
+             be performed. It is expected to take the fpn features, the original
+             features and the names of the original features as input, and returns
+             a new list of feature maps and their corresponding names
+     """
+
+     def __init__(self, in_channels_list, out_channels, extra_blocks=None):
+         super().__init__()
+         # 1x1 convs that project the resnet feature maps (layer1-4) to out_channels
+         self.inner_blocks = nn.ModuleList()
+         # 3x3 convs applied to the merged maps to produce the prediction feature maps
+         self.layer_blocks = nn.ModuleList()
+         for in_channels in in_channels_list:
+             if in_channels == 0:
+                 continue
+             inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
+             layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
+             self.inner_blocks.append(inner_block_module)
+             self.layer_blocks.append(layer_block_module)
+
+         # initialize parameters now to avoid modifying the initialization of top_blocks
+         for m in self.children():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_uniform_(m.weight, a=1)
+                 nn.init.constant_(m.bias, 0)
+
+         self.extra_blocks = extra_blocks
+
+     def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
+         """
+         This is equivalent to self.inner_blocks[idx](x),
+         but torchscript doesn't support this yet
+         """
+         num_blocks = len(self.inner_blocks)
+         if idx < 0:
+             idx += num_blocks
+         i = 0
+         out = x
+         for module in self.inner_blocks:
+             if i == idx:
+                 out = module(x)
+             i += 1
+         return out
+
+     def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
+         """
+         This is equivalent to self.layer_blocks[idx](x),
+         but torchscript doesn't support this yet
+         """
+         num_blocks = len(self.layer_blocks)
+         if idx < 0:
+             idx += num_blocks
+         i = 0
+         out = x
+         for module in self.layer_blocks:
+             if i == idx:
+                 out = module(x)
+             i += 1
+         return out
+
+     def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
+         """
+         Computes the FPN for a set of feature maps.
+         Arguments:
+             x (OrderedDict[Tensor]): feature maps for each feature level.
+         Returns:
+             results (OrderedDict[Tensor]): feature maps after FPN layers.
+                 They are ordered from highest resolution first.
+         """
+         # unpack OrderedDict into two lists for easier handling
+         names = list(x.keys())
+         x = list(x.values())
+
+         # project resnet layer4's channels to the requested out_channels
+         # last_inner = self.inner_blocks[-1](x[-1])
+         last_inner = self.get_result_from_inner_blocks(x[-1], -1)
+         # results holds one prediction feature map per level
+         results = []
+         # run the projected layer4 map through its 3x3 conv to get the prediction map
+         # results.append(self.layer_blocks[-1](last_inner))
+         results.append(self.get_result_from_layer_blocks(last_inner, -1))
+
+         for idx in range(len(x) - 2, -1, -1):
+             inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
+             feat_shape = inner_lateral.shape[-2:]
+             inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
+             last_inner = inner_lateral + inner_top_down
+             results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))
+
+         # generate the extra prediction level on top of the layer4 prediction map
+         if self.extra_blocks is not None:
+             results, names = self.extra_blocks(results, x, names)
+
+         # make it back an OrderedDict
+         out = OrderedDict([(k, v) for k, v in zip(names, results)])
+
+         return out
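
Aside (not part of the diff): a quick shape check of the FPN above, a minimal sketch assuming the package imports shown in backbone/__init__.py:

    from collections import OrderedDict
    import torch
    from sddfrcnn_model.backbone import FeaturePyramidNetwork

    fpn = FeaturePyramidNetwork(in_channels_list=[256, 512, 1024, 2048], out_channels=256)
    feats = OrderedDict((str(i), torch.randn(1, c, 64 >> i, 64 >> i))
                        for i, c in enumerate([256, 512, 1024, 2048]))
    out = fpn(feats)  # every level now has 256 channels, spatial sizes unchanged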
sddfrcnn_model/backbone/res50_backbone.py ADDED
@@ -0,0 +1,106 @@
+ import torch.nn as nn
+ import torch
+
+
+ class Bottleneck(nn.Module):
+     expansion = 4
+
+     def __init__(self, in_channel, out_channel, stride=1, downsample=None):
+         super(Bottleneck, self).__init__()
+         self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
+                                kernel_size=1, stride=1, bias=False)  # squeeze channels
+         self.bn1 = nn.BatchNorm2d(out_channel)
+         # -----------------------------------------
+         self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
+                                kernel_size=3, stride=stride, bias=False, padding=1)
+         self.bn2 = nn.BatchNorm2d(out_channel)
+         # -----------------------------------------
+         self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel*self.expansion,
+                                kernel_size=1, stride=1, bias=False)  # unsqueeze channels
+         self.bn3 = nn.BatchNorm2d(out_channel*self.expansion)
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = downsample
+
+     def forward(self, x):
+         identity = x
+         if self.downsample is not None:
+             identity = self.downsample(x)
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+         out = self.relu(out)
+
+         out = self.conv3(out)
+         out = self.bn3(out)
+
+         out += identity
+         out = self.relu(out)
+
+         return out
+
+
+ class ResNet(nn.Module):
+
+     def __init__(self, block, blocks_num, num_classes=1000, include_top=True):
+         super(ResNet, self).__init__()
+         self.include_top = include_top
+         self.in_channel = 64
+
+         self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
+                                padding=3, bias=False)
+         self.bn1 = nn.BatchNorm2d(self.in_channel)
+         self.relu = nn.ReLU(inplace=True)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+         self.layer1 = self._make_layer(block, 64, blocks_num[0])
+         self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
+         self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
+         self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
+         if self.include_top:
+             self.avgpool = nn.AdaptiveAvgPool2d((1, 1))  # output size = (1, 1)
+             self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+
+     def _make_layer(self, block, channel, block_num, stride=1):
+         downsample = None
+         if stride != 1 or self.in_channel != channel * block.expansion:
+             downsample = nn.Sequential(
+                 nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm2d(channel * block.expansion))
+
+         layers = []
+         layers.append(block(self.in_channel, channel, downsample=downsample, stride=stride))
+         self.in_channel = channel * block.expansion
+
+         for _ in range(1, block_num):
+             layers.append(block(self.in_channel, channel))
+
+         return nn.Sequential(*layers)
+
+     def forward(self, x):
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.maxpool(x)
+
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+
+         if self.include_top:
+             x = self.avgpool(x)
+             x = torch.flatten(x, 1)
+             x = self.fc(x)
+
+         return x
+
+
+ def resnet50(num_classes=1000, include_top=True):
+     return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)
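
Aside (not part of the diff): the factory above builds a plain ResNet-50 classifier; a minimal sketch:

    import torch
    from sddfrcnn_model.backbone.res50_backbone import resnet50

    net = resnet50(num_classes=1000, include_top=True).eval()
    with torch.no_grad():
        logits = net(torch.randn(1, 3, 224, 224))  # -> shape [1, 1000]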
sddfrcnn_model/backbone/resnet50_fpn_model.py ADDED
@@ -0,0 +1,199 @@
+ import os
+
+ import torch.nn as nn
+ import torch
+ from torchvision.ops.misc import FrozenBatchNorm2d
+
+ from .feature_pyramid_network import LastLevelMaxPool, BackboneWithFPN
+
+
+ class Bottleneck(nn.Module):
+     expansion = 4
+
+     def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
+         super().__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+
+         self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
+                                kernel_size=1, stride=1, bias=False)  # squeeze channels
+         self.bn1 = norm_layer(out_channel)
+         # -----------------------------------------
+         self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
+                                kernel_size=3, stride=stride, bias=False, padding=1)
+         self.bn2 = norm_layer(out_channel)
+         # -----------------------------------------
+         self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion,
+                                kernel_size=1, stride=1, bias=False)  # unsqueeze channels
+         self.bn3 = norm_layer(out_channel * self.expansion)
+         self.relu = nn.ReLU(inplace=True)
+         self.downsample = downsample
+
+     def forward(self, x):
+         identity = x
+         if self.downsample is not None:
+             identity = self.downsample(x)
+
+         out = self.conv1(x)
+         out = self.bn1(out)
+         out = self.relu(out)
+
+         out = self.conv2(out)
+         out = self.bn2(out)
+         out = self.relu(out)
+
+         out = self.conv3(out)
+         out = self.bn3(out)
+
+         out += identity
+         out = self.relu(out)
+
+         return out
+
+
+ class ResNet(nn.Module):
+
+     def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
+         super().__init__()
+         if norm_layer is None:
+             norm_layer = nn.BatchNorm2d
+         self._norm_layer = norm_layer
+
+         self.include_top = include_top
+         self.in_channel = 64
+
+         self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
+                                padding=3, bias=False)
+         self.bn1 = norm_layer(self.in_channel)
+         self.relu = nn.ReLU(inplace=True)
+         self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+         self.layer1 = self._make_layer(block, 64, blocks_num[0])
+         self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
+         self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
+         self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
+         if self.include_top:
+             self.avgpool = nn.AdaptiveAvgPool2d((1, 1))  # output size = (1, 1)
+             self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d):
+                 nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+
+     def _make_layer(self, block, channel, block_num, stride=1):
+         norm_layer = self._norm_layer
+         downsample = None
+         if stride != 1 or self.in_channel != channel * block.expansion:
+             downsample = nn.Sequential(
+                 nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
+                 norm_layer(channel * block.expansion))
+
+         layers = []
+         layers.append(block(self.in_channel, channel, downsample=downsample,
+                             stride=stride, norm_layer=norm_layer))
+         self.in_channel = channel * block.expansion
+
+         for _ in range(1, block_num):
+             layers.append(block(self.in_channel, channel, norm_layer=norm_layer))
+
+         return nn.Sequential(*layers)
+
+     def forward(self, x):
+         x = self.conv1(x)
+         x = self.bn1(x)
+         x = self.relu(x)
+         x = self.maxpool(x)
+
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+
+         if self.include_top:
+             x = self.avgpool(x)
+             x = torch.flatten(x, 1)
+             x = self.fc(x)
+
+         return x
+
+
+ def overwrite_eps(model, eps):
+     """
+     This method overwrites the default eps values of all the
+     FrozenBatchNorm2d layers of the model with the provided value.
+     This is necessary to address the BC-breaking change introduced
+     by the bug-fix at pytorch/vision#2933. The overwrite is applied
+     only when the pretrained weights are loaded to maintain compatibility
+     with previous versions.
+
+     Args:
+         model (nn.Module): The model on which we perform the overwrite.
+         eps (float): The new value of eps.
+     """
+     for module in model.modules():
+         if isinstance(module, FrozenBatchNorm2d):
+             module.eps = eps
+
+
+ def resnet50_fpn_backbone(pretrain_path="",
+                           norm_layer=FrozenBatchNorm2d,  # behaves like BatchNorm2d, but its parameters cannot be updated
+                           trainable_layers=3,
+                           returned_layers=None,
+                           extra_blocks=None):
+     """
+     Build the resnet50_fpn backbone.
+     Args:
+         pretrain_path: path to pretrained resnet50 weights; leave empty to skip loading
+         norm_layer: the official default is FrozenBatchNorm2d, i.e. a BN layer whose
+             parameters are never updated (with a very small batch_size, BN can hurt
+             more than it helps). If your GPU memory allows a large batch_size,
+             you can pass a regular BatchNorm2d instead
+             (https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
+         trainable_layers: which backbone layers to train
+         returned_layers: which layers' outputs to return
+         extra_blocks: extra blocks appended on top of the returned feature maps
+
+     Returns:
+
+     """
+     resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3],
+                              include_top=False,
+                              norm_layer=norm_layer)
+
+     # norm_layer is the class itself, not an instance, so compare with `is`
+     if norm_layer is FrozenBatchNorm2d:
+         overwrite_eps(resnet_backbone, 0.0)
+
+     if pretrain_path != "":
+         assert os.path.exists(pretrain_path), "{} does not exist.".format(pretrain_path)
+         # load the pretrained weights
+         print(resnet_backbone.load_state_dict(torch.load(pretrain_path), strict=False))
+
+     # select layers that won't be frozen
+     assert 0 <= trainable_layers <= 5
+     layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers]
+
+     # when training all layers, don't forget the bn1 that follows conv1
+     if trainable_layers == 5:
+         layers_to_train.append("bn1")
+
+     # freeze every layer that is not in the layers_to_train list
+     for name, parameter in resnet_backbone.named_parameters():
+         if all([not name.startswith(layer) for layer in layers_to_train]):
+             parameter.requires_grad_(False)
+
+     if extra_blocks is None:
+         extra_blocks = LastLevelMaxPool()
+
+     if returned_layers is None:
+         returned_layers = [1, 2, 3, 4]
+     # the number of returned feature maps must be greater than 0 and less than 5
+     assert min(returned_layers) > 0 and max(returned_layers) < 5
+
+     # return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
+     return_layers = {f'layer{k}': str(v) for v, k in enumerate(returned_layers)}
+
+     # in_channel is the channel count of layer4's output feature map = 2048
+     in_channels_stage2 = resnet_backbone.in_channel // 8  # 256
+     # channels of the feature maps resnet50 feeds into the fpn
+     in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
+     # channel count of every feature map produced by the fpn
+     out_channels = 256
+     return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
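
Aside (not part of the diff): a minimal usage sketch of the factory above; with no pretrained weights this only checks shapes, which follow the standard ResNet/FPN strides:

    import torch
    from sddfrcnn_model.backbone import resnet50_fpn_backbone

    backbone = resnet50_fpn_backbone(trainable_layers=3)
    feats = backbone(torch.randn(1, 3, 800, 800))
    # OrderedDict: levels '0'..'3' at strides 4/8/16/32 plus 'pool', all 256 channels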
sddfrcnn_model/backbone/ssd_model.py ADDED
@@ -0,0 +1,225 @@
+ import torch
+ from torch import nn, Tensor
+ from torch.jit.annotations import List
+
+ from .res50_backbone import resnet50
+ from .utils import dboxes300_coco, Encoder, PostProcess
+
+
+ class Backbone(nn.Module):
+     def __init__(self, pretrain_path=None):
+         super(Backbone, self).__init__()
+         net = resnet50()
+         self.out_channels = [1024, 512, 512, 256, 256, 256]
+
+         if pretrain_path is not None:
+             net.load_state_dict(torch.load(pretrain_path))
+
+         self.feature_extractor = nn.Sequential(*list(net.children())[:7])
+
+         conv4_block1 = self.feature_extractor[-1][0]
+
+         # change conv4_block1's stride from 2 to 1
+         conv4_block1.conv1.stride = (1, 1)
+         conv4_block1.conv2.stride = (1, 1)
+         conv4_block1.downsample[0].stride = (1, 1)
+
+     def forward(self, x):
+         x = self.feature_extractor(x)
+         return x
+
+
+ class SSD300(nn.Module):
+     def __init__(self, backbone=None, num_classes=21):
+         super(SSD300, self).__init__()
+         if backbone is None:
+             raise Exception("backbone is None")
+         if not hasattr(backbone, "out_channels"):
+             raise Exception("the backbone has no attribute: out_channels")
+         self.feature_extractor = backbone
+
+         self.num_classes = num_classes
+         # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
+         self._build_additional_features(self.feature_extractor.out_channels)
+         self.num_defaults = [4, 6, 6, 6, 4, 4]
+         location_extractors = []
+         confidence_extractors = []
+
+         # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
+         for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
+             # nd is number_default_boxes, oc is output_channel
+             location_extractors.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
+             confidence_extractors.append(nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))
+
+         self.loc = nn.ModuleList(location_extractors)
+         self.conf = nn.ModuleList(confidence_extractors)
+         self._init_weights()
+
+         default_box = dboxes300_coco()
+         self.compute_loss = Loss(default_box)
+         self.encoder = Encoder(default_box)
+         self.postprocess = PostProcess(default_box)
+
+     def _build_additional_features(self, input_size):
+         """
+         Append a series of extra conv layers to the backbone (resnet50),
+         yielding the corresponding additional feature extractors.
+         :param input_size:
+         :return:
+         """
+         additional_blocks = []
+         # input_size = [1024, 512, 512, 256, 256, 256] for resnet50
+         middle_channels = [256, 256, 128, 128, 128]
+         for i, (input_ch, output_ch, middle_ch) in enumerate(zip(input_size[:-1], input_size[1:], middle_channels)):
+             padding, stride = (1, 2) if i < 3 else (0, 1)
+             layer = nn.Sequential(
+                 nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False),
+                 nn.BatchNorm2d(middle_ch),
+                 nn.ReLU(inplace=True),
+                 nn.Conv2d(middle_ch, output_ch, kernel_size=3, padding=padding, stride=stride, bias=False),
+                 nn.BatchNorm2d(output_ch),
+                 nn.ReLU(inplace=True),
+             )
+             additional_blocks.append(layer)
+         self.additional_blocks = nn.ModuleList(additional_blocks)
+
+     def _init_weights(self):
+         layers = [*self.additional_blocks, *self.loc, *self.conf]
+         for layer in layers:
+             for param in layer.parameters():
+                 if param.dim() > 1:
+                     nn.init.xavier_uniform_(param)
+
+     # Shape the classifier to the view of bboxes
+     def bbox_view(self, features, loc_extractor, conf_extractor):
+         locs = []
+         confs = []
+         for f, l, c in zip(features, loc_extractor, conf_extractor):
+             # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1]
+             locs.append(l(f).view(f.size(0), 4, -1))
+             # [batch, n*classes, feat_size, feat_size] -> [batch, classes, -1]
+             confs.append(c(f).view(f.size(0), self.num_classes, -1))
+
+         locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
+         return locs, confs
+
+     def forward(self, image, targets=None):
+         x = self.feature_extractor(image)
+
+         # Feature Map 38x38x1024, 19x19x512, 10x10x512, 5x5x256, 3x3x256, 1x1x256
+         detection_features = torch.jit.annotate(List[Tensor], [])  # [x]
+         detection_features.append(x)
+         for layer in self.additional_blocks:
+             x = layer(x)
+             detection_features.append(x)
+
+         # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
+         locs, confs = self.bbox_view(detection_features, self.loc, self.conf)
+
+         # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results
+         # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732
+
+         if self.training:
+             if targets is None:
+                 raise ValueError("In training mode, targets should be passed")
+             # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
+             bboxes_out = targets['boxes']
+             bboxes_out = bboxes_out.transpose(1, 2).contiguous()
+             # print(bboxes_out.is_contiguous())
+             labels_out = targets['labels']
+             # print(labels_out.is_contiguous())
+
+             # ploc, plabel, gloc, glabel
+             loss = self.compute_loss(locs, confs, bboxes_out, labels_out)
+             return {"total_losses": loss}
+
+         # apply the predicted regression parameters to the default boxes to get the
+         # final boxes, then run non-maximum suppression to drop overlapping ones
+         # results = self.encoder.decode_batch(locs, confs)
+         results = self.postprocess(locs, confs)
+         return results
+
+
+ class Loss(nn.Module):
+     """
+     Implements the loss as the sum of the following:
+         1. Confidence Loss: All labels, with hard negative mining
+         2. Localization Loss: Only on positive labels
+     Suppose input dboxes has the shape 8732x4
+     """
+     def __init__(self, dboxes):
+         super(Loss, self).__init__()
+         # The two factors are from the following link
+         # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
+         self.scale_xy = 1.0 / dboxes.scale_xy  # 10
+         self.scale_wh = 1.0 / dboxes.scale_wh  # 5
+
+         self.location_loss = nn.SmoothL1Loss(reduction='none')
+         # [num_anchors, 4] -> [4, num_anchors] -> [1, 4, num_anchors]
+         self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0),
+                                    requires_grad=False)
+
+         self.confidence_loss = nn.CrossEntropyLoss(reduction='none')
+
+     def _location_vec(self, loc):
+         # type: (Tensor) -> Tensor
+         """
+         Generate Location Vectors
+         Compute the regression parameters of the ground truth relative to the anchors.
+         :param loc: GT boxes matched to each anchor, Nx4x8732
+         :return:
+         """
+         gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :]  # Nx2x8732
+         gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()  # Nx2x8732
+         return torch.cat((gxy, gwh), dim=1).contiguous()
+
+     def forward(self, ploc, plabel, gloc, glabel):
+         # type: (Tensor, Tensor, Tensor, Tensor) -> Tensor
+         """
+         ploc, plabel: Nx4x8732, Nxlabel_numx8732
+             predicted location and labels
+
+         gloc, glabel: Nx4x8732, Nx8732
+             ground truth location and labels
+         """
+         # mask of positive samples, Tensor: [N, 8732]
+         mask = torch.gt(glabel, 0)  # (gt: >)
+         # mask1 = torch.nonzero(glabel)
+         # number of positives per image in the batch, Tensor: [N]
+         pos_num = mask.sum(dim=1)
+
+         # GT location regression targets, Tensor: [N, 4, 8732]
+         vec_gd = self._location_vec(gloc)
+
+         # sum on four coordinates, and mask
+         # localization loss (positives only)
+         loc_loss = self.location_loss(ploc, vec_gd).sum(dim=1)  # Tensor: [N, 8732]
+         loc_loss = (mask.float() * loc_loss).sum(dim=1)  # Tensor: [N]
+
+         # hard negative mining, Tensor: [N, 8732]
+         con = self.confidence_loss(plabel, glabel)
+
+         # positives will never be selected as negatives
+         con_neg = con.clone()
+         con_neg[mask] = 0.0
+         # sort by confidence loss in descending order; con_idx (Tensor: [N, 8732])
+         _, con_idx = con_neg.sort(dim=1, descending=True)
+         _, con_rank = con_idx.sort(dim=1)  # neat trick: this yields the rank of every element
+
+         # number of negatives is three times the positives (hard negative mining in
+         # the original paper), but capped at the total sample count of 8732
+         neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1)
+         neg_mask = torch.lt(con_rank, neg_num)  # (lt: <) Tensor [N, 8732]
+
+         # final confidence loss = selected positive loss + selected negative loss
+         con_loss = (con * (mask.float() + neg_mask.float())).sum(dim=1)  # Tensor [N]
+
+         # avoid no object detected
+         # guard against images that contain no GT boxes
+         total_loss = loc_loss + con_loss
+         # eg. [15, 3, 5, 0] -> [1.0, 1.0, 1.0, 0.0]
+         num_mask = torch.gt(pos_num, 0).float()  # whether each image in the batch has positives
+         pos_num = pos_num.float().clamp(min=1e-6)  # avoid division by zero
+         ret = (total_loss * num_mask / pos_num).mean(dim=0)  # average only over images with positives
+         return ret
+
+
sddfrcnn_model/backbone/utils.py ADDED
@@ -0,0 +1,628 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from math import sqrt
2
+ import itertools
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+ from torch.jit.annotations import Tuple, List
7
+ from torch import nn, Tensor
8
+ import numpy as np
9
+
10
+
11
+ # This function is from https://github.com/kuangliu/pytorch-ssd.
12
+ # def calc_iou_tensor(box1, box2):
13
+ # """ Calculation of IoU based on two boxes tensor,
14
+ # Reference to https://github.com/kuangliu/pytorch-src
15
+ # input:
16
+ # box1 (N, 4) format [xmin, ymin, xmax, ymax]
17
+ # box2 (M, 4) format [xmin, ymin, xmax, ymax]
18
+ # output:
19
+ # IoU (N, M)
20
+ # """
21
+ # N = box1.size(0)
22
+ # M = box2.size(0)
23
+ #
24
+ # # (N, 4) -> (N, 1, 4) -> (N, M, 4)
25
+ # be1 = box1.unsqueeze(1).expand(-1, M, -1) # -1 means not changing the size of that dimension
26
+ # # (M, 4) -> (1, M, 4) -> (N, M, 4)
27
+ # be2 = box2.unsqueeze(0).expand(N, -1, -1)
28
+ #
29
+ # # Left Top and Right Bottom
30
+ # lt = torch.max(be1[:, :, :2], be2[:, :, :2])
31
+ # rb = torch.min(be1[:, :, 2:], be2[:, :, 2:])
32
+ #
33
+ # # compute intersection area
34
+ # delta = rb - lt # width and height
35
+ # delta[delta < 0] = 0
36
+ # # width * height
37
+ # intersect = delta[:, :, 0] * delta[:, :, 1]
38
+ #
39
+ # # compute bel1 area
40
+ # delta1 = be1[:, :, 2:] - be1[:, :, :2]
41
+ # area1 = delta1[:, :, 0] * delta1[:, :, 1]
42
+ # # compute bel2 area
43
+ # delta2 = be2[:, :, 2:] - be2[:, :, :2]
44
+ # area2 = delta2[:, :, 0] * delta2[:, :, 1]
45
+ #
46
+ # iou = intersect / (area1 + area2 - intersect)
47
+ # return iou
48
+
49
+
50
+ def box_area(boxes):
51
+ """
52
+ Computes the area of a set of bounding boxes, which are specified by its
53
+ (x1, y1, x2, y2) coordinates.
54
+
55
+ Arguments:
56
+ boxes (Tensor[N, 4]): boxes for which the area will be computed. They
57
+ are expected to be in (x1, y1, x2, y2) format
58
+
59
+ Returns:
60
+ area (Tensor[N]): area for each box
61
+ """
62
+ return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
63
+
64
+
65
+ def calc_iou_tensor(boxes1, boxes2):
66
+ """
67
+ Return intersection-over-union (Jaccard index) of boxes.
68
+
69
+ Both sets of boxes are expected to be in (x1, y1, x2, y2) format.
70
+
71
+ Arguments:
72
+ boxes1 (Tensor[N, 4])
73
+ boxes2 (Tensor[M, 4])
74
+
75
+ Returns:
76
+ iou (Tensor[N, M]): the NxM matrix containing the pairwise
77
+ IoU values for every element in boxes1 and boxes2
78
+ """
79
+ area1 = box_area(boxes1)
80
+ area2 = box_area(boxes2)
81
+
82
+ # When the shapes do not match,
83
+ # the shape of the returned output tensor follows the broadcasting rules
84
+ lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # left-top [N,M,2]
85
+ rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # right-bottom [N,M,2]
86
+
87
+ wh = (rb - lt).clamp(min=0) # [N,M,2]
88
+ inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
89
+
90
+ iou = inter / (area1[:, None] + area2 - inter)
91
+ return iou
92
+
93
+
94
+ # This function is from https://github.com/kuangliu/pytorch-ssd.
95
+ class Encoder(object):
96
+ """
97
+ Inspired by https://github.com/kuangliu/pytorch-src
98
+ Transform between (bboxes, lables) <-> SSD output
99
+
100
+ dboxes: default boxes in size 8732 x 4,
101
+ encoder: input ltrb format, output xywh format
102
+ decoder: input xywh format, output ltrb format
103
+
104
+ encode:
105
+ input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
106
+ output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
107
+ criteria : IoU threshold of bboexes
108
+
109
+ decode:
110
+ input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
111
+ output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
112
+ criteria : IoU threshold of bboexes
113
+ max_output : maximum number of output bboxes
114
+ """
115
+ def __init__(self, dboxes):
116
+ self.dboxes = dboxes(order='ltrb')
117
+ self.dboxes_xywh = dboxes(order='xywh').unsqueeze(dim=0)
118
+ self.nboxes = self.dboxes.size(0) # default boxes的数量
119
+ self.scale_xy = dboxes.scale_xy
120
+ self.scale_wh = dboxes.scale_wh
121
+
122
+ def encode(self, bboxes_in, labels_in, criteria=0.5):
123
+ """
124
+ encode:
125
+ input : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
126
+ output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
127
+ criteria : IoU threshold of bboexes
128
+ """
129
+ # [nboxes, 8732]
130
+ ious = calc_iou_tensor(bboxes_in, self.dboxes) # 计算每个GT与default box的iou
131
+ # [8732,]
132
+ best_dbox_ious, best_dbox_idx = ious.max(dim=0) # 寻找每个default box匹配到的最大IoU
133
+ # [nboxes,]
134
+ best_bbox_ious, best_bbox_idx = ious.max(dim=1) # 寻找每个GT匹配到的最大IoU
135
+
136
+ # 将每个GT匹配到的最佳default box设置为正样本(对应论文中Matching strategy的第一条)
137
+ # set best ious 2.0
138
+ best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0) # dim, index, value
139
+ # 将相应default box匹配最大IOU的GT索引进行替换
140
+ idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
141
+ best_dbox_idx[best_bbox_idx[idx]] = idx
142
+
143
+ # filter IoU > 0.5
144
+ # 寻找与GT iou大于0.5的default box,对应论文中Matching strategy的第二条(这里包括了第一条匹配到的信息)
145
+ masks = best_dbox_ious > criteria
146
+ # [8732,]
147
+ labels_out = torch.zeros(self.nboxes, dtype=torch.int64)
148
+ labels_out[masks] = labels_in[best_dbox_idx[masks]]
149
+ # 将default box匹配到正样本的位置设置成对应GT的box信息
150
+ bboxes_out = self.dboxes.clone()
151
+ bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]
152
+
153
+ # Transform format to xywh format
154
+ x = 0.5 * (bboxes_out[:, 0] + bboxes_out[:, 2]) # x
155
+ y = 0.5 * (bboxes_out[:, 1] + bboxes_out[:, 3]) # y
156
+ w = bboxes_out[:, 2] - bboxes_out[:, 0] # w
157
+ h = bboxes_out[:, 3] - bboxes_out[:, 1] # h
158
+ bboxes_out[:, 0] = x
159
+ bboxes_out[:, 1] = y
160
+ bboxes_out[:, 2] = w
161
+ bboxes_out[:, 3] = h
162
+ return bboxes_out, labels_out
163
+
164
+ def scale_back_batch(self, bboxes_in, scores_in):
165
+ """
166
+ 将box格式从xywh转换回ltrb, 将预测目标score通过softmax处理
167
+ Do scale and transform from xywh to ltrb
168
+ suppose input N x 4 x num_bbox | N x label_num x num_bbox
169
+
170
+ bboxes_in: 是网络预测的xywh回归参数
171
+ scores_in: 是预测的每个default box的各目标概率
172
+ """
173
+ if bboxes_in.device == torch.device("cpu"):
174
+ self.dboxes = self.dboxes.cpu()
175
+ self.dboxes_xywh = self.dboxes_xywh.cpu()
176
+ else:
177
+ self.dboxes = self.dboxes.cuda()
178
+ self.dboxes_xywh = self.dboxes_xywh.cuda()
179
+
180
+ # Returns a view of the original tensor with its dimensions permuted.
181
+ bboxes_in = bboxes_in.permute(0, 2, 1)
182
+ scores_in = scores_in.permute(0, 2, 1)
183
+ # print(bboxes_in.is_contiguous())
184
+
185
+ bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2] # 预测的x, y回归参数
186
+ bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:] # 预测的w, h回归参数
187
+
188
+ # 将预测的回归参数叠加到default box上得到最终的预测边界框
189
+ bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
190
+ bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
191
+
192
+ # transform format to ltrb
193
+ l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
194
+ t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
195
+ r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
196
+ b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]
197
+
198
+ bboxes_in[:, :, 0] = l # xmin
199
+ bboxes_in[:, :, 1] = t # ymin
200
+ bboxes_in[:, :, 2] = r # xmax
201
+ bboxes_in[:, :, 3] = b # ymax
202
+
203
+ return bboxes_in, F.softmax(scores_in, dim=-1)
204
+
205
+ def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200):
206
+ # 将box格式从xywh转换回ltrb(方便后面非极大值抑制时求iou), 将预测目标score通过softmax处理
207
+ bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)
208
+
209
+ outputs = []
210
+ # 遍历一个batch中的每张image数据
211
+ for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
212
+ bbox = bbox.squeeze(0)
213
+ prob = prob.squeeze(0)
214
+ outputs.append(self.decode_single_new(bbox, prob, criteria, max_output))
215
+ return outputs
216
+
217
+ def decode_single_new(self, bboxes_in, scores_in, criteria, num_output=200):
218
+ """
219
+ decode:
220
+ input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
221
+ output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
222
+ criteria : IoU threshold of bboexes
223
+ max_output : maximum number of output bboxes
224
+ """
225
+ device = bboxes_in.device
226
+ num_classes = scores_in.shape[-1]
227
+
228
+ # 对越界的bbox进行裁剪
229
+ bboxes_in = bboxes_in.clamp(min=0, max=1)
230
+
231
+ # [8732, 4] -> [8732, 21, 4]
232
+ bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)
233
+
234
+ # create labels for each prediction
235
+ labels = torch.arange(num_classes, device=device)
236
+ labels = labels.view(1, -1).expand_as(scores_in)
237
+
238
+ # remove prediction with the background label
239
+ # 移除归为背景类别的概率信息
240
+ bboxes_in = bboxes_in[:, 1:, :]
241
+ scores_in = scores_in[:, 1:]
242
+ labels = labels[:, 1:]
243
+
244
+ # batch everything, by making every class prediction be a separate instance
245
+ bboxes_in = bboxes_in.reshape(-1, 4)
246
+ scores_in = scores_in.reshape(-1)
247
+ labels = labels.reshape(-1)
248
+
249
+ # remove low scoring boxes
250
+ # 移除低概率目标,self.scores_thresh=0.05
251
+ inds = torch.nonzero(scores_in > 0.05, as_tuple=False).squeeze(1)
252
+ bboxes_in, scores_in, labels = bboxes_in[inds], scores_in[inds], labels[inds]
253
+
254
+ # remove empty boxes
255
+ ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
256
+ keep = (ws >= 0.1 / 300) & (hs >= 0.1 / 300)
257
+ keep = keep.nonzero(as_tuple=False).squeeze(1)
258
+ bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]
259
+
260
+ # non-maximum suppression
261
+ keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)
262
+
263
+ # keep only topk scoring predictions
264
+ keep = keep[:num_output]
265
+ bboxes_out = bboxes_in[keep, :]
266
+ scores_out = scores_in[keep]
267
+ labels_out = labels[keep]
268
+
269
+ return bboxes_out, labels_out, scores_out
+
+    # perform non-maximum suppression
+    def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
+        """
+        decode:
+            input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
+            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
+            criteria : IoU threshold of bboxes
+            max_output : maximum number of output bboxes
+        """
+        # Reference: https://github.com/amdegroot/ssd.pytorch
+        bboxes_out = []
+        scores_out = []
+        labels_out = []
+
+        # non-maximum suppression, one class at a time
+        # scores_in (Tensor 8732 x nitems): each column holds the scores of all 8732 boxes for one class
+        for i, score in enumerate(scores_in.split(1, 1)):
+            # skip background
+            if i == 0:
+                continue
+
+            # [8732, 1] -> [8732]
+            score = score.squeeze(1)
+
+            # filter out predictions with probability below 0.05
+            mask = score > 0.05
+            bboxes, score = bboxes_in[mask, :], score[mask]
+            if score.size(0) == 0:
+                continue
+
+            # sort scores in ascending order
+            score_sorted, score_idx_sorted = score.sort(dim=0)
+
+            # keep at most max_num candidates
+            score_idx_sorted = score_idx_sorted[-max_num:]
+            candidates = []
+
+            while score_idx_sorted.numel() > 0:
+                idx = score_idx_sorted[-1].item()
+                # boxes of the remaining candidates, Tensor: [len(score_idx_sorted), 4]
+                bboxes_sorted = bboxes[score_idx_sorted, :]
+                # the current highest-scoring box, Tensor: [1, 4]
+                bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
+                # IoU between every remaining candidate and the highest-scoring box
+                iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze()
+
+                # we only need iou < criteria
+                # drop every candidate whose IoU with the top box is >= criteria (including the top box itself)
+                score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
+                # keep the index of the top-scoring box
+                candidates.append(idx)
+
+            # store the predictions of this class that survived NMS
+            bboxes_out.append(bboxes[candidates, :])   # box coordinates
+            scores_out.append(score[candidates])       # scores
+            labels_out.extend([i] * len(candidates))   # labels
+
+        if not bboxes_out:  # if nothing survived, return empty tensors; note the (0, 4) box shape to avoid errors during validation
+            return [torch.empty(size=(0, 4)), torch.empty(size=(0,), dtype=torch.int64), torch.empty(size=(0,))]
+
+        bboxes_out = torch.cat(bboxes_out, dim=0).contiguous()
+        scores_out = torch.cat(scores_out, dim=0).contiguous()
+        labels_out = torch.as_tensor(labels_out, dtype=torch.long)
+
+        # sort all predictions by score (regardless of class) and keep the top max_output
+        _, max_ids = scores_out.sort(dim=0)
+        max_ids = max_ids[-max_output:]
+        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]
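The loop above is the classic greedy NMS. As a standalone reference (an editor's sketch, not part of the commit), here is the same algorithm in plain PyTorch, with the IoU computed inline instead of through calc_iou_tensor:

import torch

def greedy_nms(boxes, scores, iou_thresh=0.5):
    # boxes: [N, 4] in (xmin, ymin, xmax, ymax); scores: [N]
    order = scores.argsort(descending=True)
    keep = []
    while order.numel() > 0:
        top = order[0]
        keep.append(top.item())
        if order.numel() == 1:
            break
        rest = boxes[order[1:]]
        # intersection of the top box with every remaining box
        lt = torch.max(boxes[top, :2], rest[:, :2])
        rb = torch.min(boxes[top, 2:], rest[:, 2:])
        inter = (rb - lt).clamp(min=0).prod(dim=1)
        area_top = (boxes[top, 2:] - boxes[top, :2]).prod()
        area_rest = (rest[:, 2:] - rest[:, :2]).prod(dim=1)
        iou = inter / (area_top + area_rest - inter)
        # keep only candidates that overlap the top box less than the threshold
        order = order[1:][iou < iou_thresh]
    return torch.as_tensor(keep)

boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.], [20., 20., 30., 30.]])
scores = torch.tensor([0.9, 0.8, 0.7])
print(greedy_nms(boxes, scores))  # tensor([0, 2]) -- box 1 overlaps box 0 too much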
+
+
+class DefaultBoxes(object):
+    def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, scale_xy=0.1, scale_wh=0.2):
+        self.fig_size = fig_size    # input image size of the network, 300
+        # [38, 19, 10, 5, 3, 1]
+        self.feat_size = feat_size  # feature map size of every prediction layer
+
+        self.scale_xy_ = scale_xy
+        self.scale_wh_ = scale_wh
+
+        # According to https://github.com/weiliu89/caffe
+        # Calculation method slightly different from paper
+        # [8, 16, 32, 64, 100, 300]
+        self.steps = steps  # stride of one cell of each feature layer on the original image
+
+        # [21, 45, 99, 153, 207, 261, 315]
+        self.scales = scales  # scale of the default boxes predicted on each feature layer
+
+        fk = fig_size / np.array(steps)  # fk of every feature layer
+        # [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
+        self.aspect_ratios = aspect_ratios  # aspect ratios of the default boxes on each prediction feature layer
+
+        self.default_boxes = []
+        # size of feature and number of feature
+        # iterate over the feature layers and compute their default boxes
+        for idx, sfeat in enumerate(self.feat_size):
+            sk1 = scales[idx] / fig_size      # convert scale to a relative value in [0, 1]
+            sk2 = scales[idx + 1] / fig_size  # convert scale to a relative value in [0, 1]
+            sk3 = sqrt(sk1 * sk2)
+            # first add the two default boxes with 1:1 aspect ratio
+            all_sizes = [(sk1, sk1), (sk3, sk3)]
+
+            # then add the width/height pairs of the remaining aspect ratios
+            for alpha in aspect_ratios[idx]:
+                w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha)
+                all_sizes.append((w, h))
+                all_sizes.append((h, w))
+
+            # compute all default boxes of this feature layer on the original image
+            for w, h in all_sizes:
+                for i, j in itertools.product(range(sfeat), repeat=2):  # i -> row (y), j -> column (x)
+                    # center of each default box, normalized to [0, 1]
+                    cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
+                    self.default_boxes.append((cx, cy, w, h))
+
+        # convert default_boxes to a tensor (an explicit dtype is required here to avoid an error)
+        self.dboxes = torch.as_tensor(self.default_boxes, dtype=torch.float32)
+        self.dboxes.clamp_(min=0, max=1)  # clamp (x, y, w, h) to [0, 1]
+
+        # For IoU calculation
+        # ltrb is left-top coordinate and right-bottom coordinate
+        # convert (x, y, w, h) to (xmin, ymin, xmax, ymax) for the IoU computation used when matching positive/negative samples
+        self.dboxes_ltrb = self.dboxes.clone()
+        self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]  # xmin
+        self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]  # ymin
+        self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]  # xmax
+        self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]  # ymax
+
+    @property
+    def scale_xy(self):
+        return self.scale_xy_
+
+    @property
+    def scale_wh(self):
+        return self.scale_wh_
+
+    def __call__(self, order='ltrb'):
+        # return the default boxes in the requested format
+        if order == 'ltrb':
+            return self.dboxes_ltrb
+
+        if order == 'xywh':
+            return self.dboxes
+
+
+def dboxes300_coco():
+    figsize = 300  # input image size of the network
+    feat_size = [38, 19, 10, 5, 3, 1]   # feature map size of every prediction layer
+    steps = [8, 16, 32, 64, 100, 300]   # stride of one cell of each feature layer on the original image
+    # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
+    scales = [21, 45, 99, 153, 207, 261, 315]  # scale of the default boxes predicted on each feature layer
+    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]  # aspect ratios of the default boxes on each prediction feature layer
+    dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
+    return dboxes
+
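With these settings each layer predicts 2 square boxes plus 2 boxes per extra aspect ratio at every cell, which is where the 8732 in the shape comments throughout this file comes from. A quick arithmetic check (editor's sketch, not part of the file):

feat_size = [38, 19, 10, 5, 3, 1]
boxes_per_cell = [4, 6, 6, 6, 4, 4]  # 2 + 2 * len(aspect_ratios[i])
total = sum(f * f * n for f, n in zip(feat_size, boxes_per_cell))
print(total)  # 8732 = 5776 + 2166 + 600 + 150 + 36 + 4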
+
+def nms(boxes, scores, iou_threshold):
+    # type: (Tensor, Tensor, float) -> Tensor
+    """
+    Performs non-maximum suppression (NMS) on the boxes according
+    to their intersection-over-union (IoU).
+
+    NMS iteratively removes lower scoring boxes which have an
+    IoU greater than iou_threshold with another (higher scoring)
+    box.
+
+    Parameters
+    ----------
+    boxes : Tensor[N, 4]
+        boxes to perform NMS on. They
+        are expected to be in (x1, y1, x2, y2) format
+    scores : Tensor[N]
+        scores for each one of the boxes
+    iou_threshold : float
+        discards all overlapping
+        boxes with IoU > iou_threshold
+
+    Returns
+    -------
+    keep : Tensor
+        int64 tensor with the indices
+        of the elements that have been kept
+        by NMS, sorted in decreasing order of scores
+    """
+    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)
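This thin wrapper dispatches to torchvision's compiled NMS kernel; as far as I know the public torchvision.ops.nms binds the same op, so the behavior can be sanity-checked directly (a sketch):

import torch
from torchvision.ops import nms as tv_nms

boxes = torch.tensor([[0., 0., 10., 10.], [1., 1., 10., 10.]])
scores = torch.tensor([0.9, 0.8])
print(tv_nms(boxes, scores, 0.5))  # tensor([0]) -- the lower-scoring overlap is suppressed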
+
+
+def batched_nms(boxes, scores, idxs, iou_threshold):
+    # type: (Tensor, Tensor, Tensor, float) -> Tensor
+    """
+    Performs non-maximum suppression in a batched fashion.
+
+    Each index value corresponds to a category, and NMS
+    will not be applied between elements of different categories.
+
+    Parameters
+    ----------
+    boxes : Tensor[N, 4]
+        boxes where NMS will be performed. They
+        are expected to be in (x1, y1, x2, y2) format
+    scores : Tensor[N]
+        scores for each one of the boxes
+    idxs : Tensor[N]
+        indices of the categories for each one of the boxes.
+    iou_threshold : float
+        discards all overlapping boxes
+        with IoU > iou_threshold
+
+    Returns
+    -------
+    keep : Tensor
+        int64 tensor with the indices of
+        the elements that have been kept by NMS, sorted
+        in decreasing order of scores
+    """
+    if boxes.numel() == 0:
+        return torch.empty((0,), dtype=torch.int64, device=boxes.device)
+
+    # strategy: in order to perform NMS independently per class,
+    # we add an offset to all the boxes. The offset is dependent
+    # only on the class idx, and is large enough so that boxes
+    # from different classes do not overlap
+    # largest coordinate value among all boxes (xmin, ymin, xmax, ymax)
+    max_coordinate = boxes.max()
+
+    # generate a large per-class offset;
+    # to() only makes the dtype and device of the new tensor match boxes
+    offsets = idxs.to(boxes) * (max_coordinate + 1)
+    # after adding the per-class offset, boxes of different classes can no longer overlap
+    boxes_for_nms = boxes + offsets[:, None]
+    keep = nms(boxes_for_nms, scores, iou_threshold)
+    return keep
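A toy illustration (editor's sketch) of the offset trick: two identical boxes assigned to different classes stop overlapping after the shift, so class-aware NMS keeps both:

import torch

boxes = torch.tensor([[0., 0., 10., 10.], [0., 0., 10., 10.]])
idxs = torch.tensor([0, 1])
offsets = idxs.to(boxes) * (boxes.max() + 1)  # tensor([ 0., 11.])
print(boxes + offsets[:, None])
# tensor([[ 0.,  0., 10., 10.],
#         [11., 11., 21., 21.]]) -- disjoint, so NMS treats them independently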
+
+
+class PostProcess(nn.Module):
+    def __init__(self, dboxes):
+        super(PostProcess, self).__init__()
+        # [num_anchors, 4] -> [1, num_anchors, 4]
+        self.dboxes_xywh = nn.Parameter(dboxes(order='xywh').unsqueeze(dim=0),
+                                        requires_grad=False)
+        self.scale_xy = dboxes.scale_xy  # 0.1
+        self.scale_wh = dboxes.scale_wh  # 0.2
+
+        self.criteria = 0.5
+        self.max_output = 100
+
+    def scale_back_batch(self, bboxes_in, scores_in):
+        # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor]
+        """
+        1) apply the predicted regression parameters to the default boxes to get the final predicted coordinates
+        2) convert the boxes from xywh back to ltrb format
+        3) pass the predicted class scores through softmax
+        Do scale and transform from xywh to ltrb
+        suppose input N x 4 x num_bbox | N x label_num x num_bbox
+
+        bboxes_in: [N, 4, 8732], the xywh regression parameters predicted by the network
+        scores_in: [N, label_num, 8732], the per-class probabilities predicted for each default box
+        """
+
+        # Returns a view of the original tensor with its dimensions permuted.
+        # [batch, 4, 8732] -> [batch, 8732, 4]
+        bboxes_in = bboxes_in.permute(0, 2, 1)
+        # [batch, label_num, 8732] -> [batch, 8732, label_num]
+        scores_in = scores_in.permute(0, 2, 1)
+
+        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]  # predicted x, y regression parameters
+        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]  # predicted w, h regression parameters
+
+        # apply the predicted regression parameters to the default boxes to obtain the final predicted boxes
+        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
+        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]
+
+        # transform format to ltrb
+        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
+        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
+        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
+        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]
+
+        bboxes_in[:, :, 0] = l  # xmin
+        bboxes_in[:, :, 1] = t  # ymin
+        bboxes_in[:, :, 2] = r  # xmax
+        bboxes_in[:, :, 3] = b  # ymax
+
+        # scores_in: [batch, 8732, label_num]
+        return bboxes_in, F.softmax(scores_in, dim=-1)
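The decode step above inverts the standard SSD box encoding. A minimal sketch (editor's example) of the same math on a single made-up regression vector and default box, both in (cx, cy, w, h):

import torch

pred = torch.tensor([0.5, -0.2, 0.1, 0.3])   # hypothetical network output
dbox = torch.tensor([0.4, 0.4, 0.2, 0.2])    # hypothetical default box
scale_xy, scale_wh = 0.1, 0.2
cx = scale_xy * pred[0] * dbox[2] + dbox[0]  # 0.41
cy = scale_xy * pred[1] * dbox[3] + dbox[1]  # 0.396
w = torch.exp(scale_wh * pred[2]) * dbox[2]  # ~0.204
h = torch.exp(scale_wh * pred[3]) * dbox[3]  # ~0.212
ltrb = (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)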
+
+    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
+        # type: (Tensor, Tensor, float, int) -> Tuple[Tensor, Tensor, Tensor]
+        """
+        decode:
+            input : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
+            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
+            criteria : IoU threshold of bboxes
+            num_output : maximum number of output bboxes
+        """
+        device = bboxes_in.device
+        num_classes = scores_in.shape[-1]
+
+        # clip boxes that fall outside the image (coordinates are relative, in [0, 1])
+        bboxes_in = bboxes_in.clamp(min=0, max=1)
+
+        # [8732, 4] -> [8732, 21, 4]
+        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)
+
+        # create labels for each prediction
+        labels = torch.arange(num_classes, device=device)
+        # [num_classes] -> [8732, num_classes]
+        labels = labels.view(1, -1).expand_as(scores_in)
+
+        # remove predictions with the background label
+        bboxes_in = bboxes_in[:, 1:, :]  # [8732, 21, 4] -> [8732, 20, 4]
+        scores_in = scores_in[:, 1:]     # [8732, 21] -> [8732, 20]
+        labels = labels[:, 1:]           # [8732, 21] -> [8732, 20]
+
+        # batch everything, by making every class prediction be a separate instance
+        bboxes_in = bboxes_in.reshape(-1, 4)  # [8732, 20, 4] -> [8732x20, 4]
+        scores_in = scores_in.reshape(-1)     # [8732, 20] -> [8732x20]
+        labels = labels.reshape(-1)           # [8732, 20] -> [8732x20]
+
+        # remove low scoring boxes (score threshold 0.05)
+        # inds = torch.nonzero(scores_in > 0.05).squeeze(1)
+        inds = torch.where(torch.gt(scores_in, 0.05))[0]
+        bboxes_in, scores_in, labels = bboxes_in[inds, :], scores_in[inds], labels[inds]
+
+        # remove empty boxes
+        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
+        keep = (ws >= 1 / 300) & (hs >= 1 / 300)
+        # keep = keep.nonzero().squeeze(1)
+        keep = torch.where(keep)[0]
+        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]
+
+        # non-maximum suppression
+        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)
+
+        # keep only the top-k scoring predictions
+        keep = keep[:num_output]
+        bboxes_out = bboxes_in[keep, :]
+        scores_out = scores_in[keep]
+        labels_out = labels[keep]
+
+        return bboxes_out, labels_out, scores_out
+
+    def forward(self, bboxes_in, scores_in):
+        # apply the box regression parameters to the default boxes and softmax the class scores
+        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)
+
+        outputs = torch.jit.annotate(List[Tuple[Tensor, Tensor, Tensor]], [])
+        # iterate over every image in the batch
+        # bboxes: [batch, 8732, 4]
+        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):  # split_size, split_dim
+            # bbox: [1, 8732, 4]
+            bbox = bbox.squeeze(0)
+            prob = prob.squeeze(0)
+            outputs.append(self.decode_single_new(bbox, prob, self.criteria, self.max_output))
+        return outputs
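Putting this file's pieces together, a hypothetical end-to-end decode of random outputs (editor's sketch; shapes follow the comments above, 21 = 20 classes plus background, and F/torchvision are assumed to be imported at the top of the file):

import torch

dboxes = dboxes300_coco()
post = PostProcess(dboxes)
loc = torch.randn(2, 4, 8732)    # fake regression output for a batch of 2 images
conf = torch.randn(2, 21, 8732)  # fake class logits
with torch.no_grad():
    detections = post(loc, conf)  # list of (boxes, labels, scores), one tuple per image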
sddfrcnn_model/draw_box_utils.py ADDED
@@ -0,0 +1,197 @@
+from PIL.Image import Image, fromarray
+import PIL.ImageDraw as ImageDraw
+import PIL.ImageFont as ImageFont
+from PIL import ImageColor
+import numpy as np
+
+STANDARD_COLORS = [
+    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
+    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
+    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
+    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
+    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
+    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
+    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
+    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
+    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
+    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
+    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
+    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
+    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
+    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
+    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
+    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
+    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
+    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
+    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
+    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
+    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
+    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
+    'WhiteSmoke', 'Yellow', 'YellowGreen'
+]
+
+
+def draw_text(draw,
+              box: list,
+              cls: int,
+              score: float,
+              category_index: dict,
+              color: str,
+              font: str = 'arial.ttf',
+              font_size: int = 24):
+    """
+    Draw the bounding box and class information onto the image.
+    """
+    try:
+        font = ImageFont.truetype(font, font_size)
+    except IOError:
+        font = ImageFont.load_default()
+
+    left, top, right, bottom = box
+    # If the total height of the display strings added to the top of the bounding
+    # box exceeds the top of the image, stack the strings below the bounding box
+    # instead of above.
+    display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
+    # textbbox replaces the deprecated draw.textsize for measuring text
+    bbox = draw.textbbox((0, 0), display_str, font=font)
+    text_width = bbox[2] - bbox[0]
+    text_height = bbox[3] - bbox[1]
+    # Each display_str has a top and bottom margin of 0.05x.
+    display_str_height = (1 + 2 * 0.05) * text_height
+
+    if top > display_str_height:
+        text_top = top - display_str_height
+        text_bottom = top
+    else:
+        text_top = bottom
+        text_bottom = bottom + display_str_height
+    margin = np.ceil(0.05 * text_width)
+    text_rect_left = left
+    text_rect_right = left + text_width + 2 * margin
+
+    # make sure the text rectangle stays inside the image
+    img_width, img_height = draw.im.size
+    if text_rect_right > img_width:
+        text_rect_right = img_width
+        text_rect_left = max(0, img_width - text_width - 2 * margin)
+
+    if text_bottom > img_height:
+        text_bottom = img_height
+        text_top = max(0, img_height - display_str_height)
+
+    # draw the text background and the text itself
+    draw.rectangle([(text_rect_left, text_top),
+                    (text_rect_right, text_bottom)], fill=color)
+    draw.text((text_rect_left + margin, text_top),
+              display_str,
+              fill='black',
+              font=font)
+
+
+def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
+    np_image = np.array(image)
+    masks = np.where(masks > thresh, True, False)
+
+    img_to_draw = np.copy(np_image)
+    # TODO: There might be a way to vectorize this
+    for mask, color in zip(masks, colors):
+        img_to_draw[mask] = color
+
+    # alpha-blend the painted masks with the original image
+    out = np_image * (1 - alpha) + img_to_draw * alpha
+    return fromarray(out.astype(np.uint8))
+
+
+def draw_objs(image: Image,
+              boxes: np.ndarray = None,
+              classes: np.ndarray = None,
+              scores: np.ndarray = None,
+              masks: np.ndarray = None,
+              category_index: dict = None,
+              box_thresh: float = 0.1,
+              mask_thresh: float = 0.5,
+              line_thickness: int = 8,
+              font: str = 'arial.ttf',
+              font_size: int = 24,
+              draw_boxes_on_image: bool = True,
+              draw_masks_on_image: bool = False):
+    """
+    Draw bounding boxes, class labels and masks onto the image.
+    Args:
+        image: the image to draw on
+        boxes: bounding box coordinates
+        classes: class indices
+        scores: detection scores
+        masks: instance masks
+        category_index: dict mapping class indices to names
+        box_thresh: score threshold used to filter detections
+        mask_thresh: threshold used to binarize masks
+        line_thickness: bounding box line width
+        font: font type
+        font_size: font size
+        draw_boxes_on_image: whether to draw bounding boxes
+        draw_masks_on_image: whether to draw masks
+
+    Returns:
+        the annotated image
+    """
+
+    # filter out low-scoring detections
+    idxs = np.greater(scores, box_thresh)
+    boxes = boxes[idxs]
+    classes = classes[idxs]
+    scores = scores[idxs]
+    if masks is not None:
+        masks = masks[idxs]
+    if len(boxes) == 0:
+        return image
+
+    colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]
+
+    if draw_boxes_on_image:
+        # Draw all boxes onto image.
+        draw = ImageDraw.Draw(image)
+        for box, cls, score, color in zip(boxes, classes, scores, colors):
+            left, top, right, bottom = box
+
+            # clamp the box to the image bounds
+            img_width, img_height = image.size
+            left = max(0, min(left, img_width - 1))
+            top = max(0, min(top, img_height - 1))
+            right = max(0, min(right, img_width - 1))
+            bottom = max(0, min(bottom, img_height - 1))
+
+            # draw the bounding box as four segments (instead of one five-point polyline)
+            draw.line([(left, top), (right, top)], width=line_thickness, fill=color)        # top edge
+            draw.line([(right, top), (right, bottom)], width=line_thickness, fill=color)    # right edge
+            draw.line([(right, bottom), (left, bottom)], width=line_thickness, fill=color)  # bottom edge
+            draw.line([(left, bottom), (left, top)], width=line_thickness, fill=color)      # left edge
+
+            # draw class and score information
+            draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)
+
+    if draw_masks_on_image and (masks is not None):
+        # Draw all masks onto image.
+        image = draw_masks(image, masks, colors, mask_thresh)
+
+    return image
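A hypothetical call (editor's sketch; the image and label map are made up for illustration):

from PIL import Image as PILImage
import numpy as np

img = PILImage.new('RGB', (300, 300))
boxes = np.array([[30., 40., 120., 160.]])  # (left, top, right, bottom)
classes = np.array([1])
scores = np.array([0.87])
category_index = {"1": "object"}            # hypothetical class-name map
img = draw_objs(img, boxes, classes, scores, category_index=category_index)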
sddfrcnn_model/network_files/__init__.py ADDED
@@ -0,0 +1 @@
+from .retinanet import RetinaNet
sddfrcnn_model/network_files/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (212 Bytes)
sddfrcnn_model/network_files/__pycache__/anchor_utils.cpython-310.pyc ADDED
Binary file (5.19 kB)
sddfrcnn_model/network_files/__pycache__/boxes.cpython-310.pyc ADDED
Binary file (5.12 kB)
sddfrcnn_model/network_files/__pycache__/det_utils.cpython-310.pyc ADDED
Binary file (10.6 kB)
sddfrcnn_model/network_files/__pycache__/image_list.cpython-310.pyc ADDED
Binary file (1.15 kB)
sddfrcnn_model/network_files/__pycache__/losses.cpython-310.pyc ADDED
Binary file (1.91 kB)
sddfrcnn_model/network_files/__pycache__/retinanet.cpython-310.pyc ADDED
Binary file (16.2 kB)
sddfrcnn_model/network_files/__pycache__/transform.cpython-310.pyc ADDED
Binary file (8.89 kB)
sddfrcnn_model/network_files/anchor_utils.py ADDED
@@ -0,0 +1,192 @@
+from typing import List, Optional, Dict
+
+import torch
+from torch import nn, Tensor
+
+from .image_list import ImageList
+
+
+class AnchorsGenerator(nn.Module):
+    __annotations__ = {
+        "cell_anchors": Optional[List[torch.Tensor]],
+        "_cache": Dict[str, List[torch.Tensor]]
+    }
+
+    """
+    Anchor generator.
+    Module that generates anchors for a set of feature maps and
+    image sizes.
+
+    The module supports computing anchors at multiple sizes and aspect ratios
+    per feature map.
+
+    sizes and aspect_ratios should have the same number of elements, and it should
+    correspond to the number of feature maps.
+
+    sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
+    and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
+    per spatial location for feature map i.
+
+    Arguments:
+        sizes (Tuple[Tuple[int]]):
+        aspect_ratios (Tuple[Tuple[float]]):
+    """
+
+    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
+        super(AnchorsGenerator, self).__init__()
+
+        if not isinstance(sizes[0], (list, tuple)):
+            # TODO change this
+            sizes = tuple((s,) for s in sizes)
+        if not isinstance(aspect_ratios[0], (list, tuple)):
+            aspect_ratios = (aspect_ratios,) * len(sizes)
+
+        assert len(sizes) == len(aspect_ratios)
+
+        self.sizes = sizes
+        self.aspect_ratios = aspect_ratios
+        self.cell_anchors = None
+        self._cache = {}
+
+    def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
+        # type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
+        """
+        compute anchor sizes
+        Arguments:
+            scales: sqrt(anchor_area)
+            aspect_ratios: h/w ratios
+            dtype: float32
+            device: cpu/gpu
+        """
+        scales = torch.as_tensor(scales, dtype=dtype, device=device)
+        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
+        h_ratios = torch.sqrt(aspect_ratios)
+        w_ratios = 1.0 / h_ratios
+
+        # [r1, r2, r3]' * [s1, s2, s3]
+        # number of elements is len(ratios)*len(scales)
+        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
+        hs = (h_ratios[:, None] * scales[None, :]).view(-1)
+
+        # left-top, right-bottom coordinates relative to the anchor center (0, 0)
+        # the generated anchor templates are all centered at (0, 0), shape [len(ratios)*len(scales), 4]
+        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
+
+        return base_anchors.round()  # round to the nearest integer
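For a concrete feel of the templates, this is what generate_anchors returns for one scale of 128 and ratios (0.5, 1.0, 2.0) — values as I compute them, all boxes centered at (0, 0):

gen = AnchorsGenerator(sizes=(128,), aspect_ratios=(0.5, 1.0, 2.0))
print(gen.generate_anchors([128], [0.5, 1.0, 2.0]))
# tensor([[-91., -45.,  91.,  45.],
#         [-64., -64.,  64.,  64.],
#         [-45., -91.,  45.,  91.]])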
+
+    def set_cell_anchors(self, dtype, device):
+        # type: (torch.dtype, torch.device) -> None
+        if self.cell_anchors is not None:
+            cell_anchors = self.cell_anchors
+            assert cell_anchors is not None
+            # suppose that all anchors have the same device
+            # which is a valid assumption in the current state of the codebase
+            if cell_anchors[0].device == device:
+                return
+
+        # generate the anchor templates from the given sizes and aspect_ratios
+        # every template is centered at (0, 0)
+        cell_anchors = [
+            self.generate_anchors(sizes, aspect_ratios, dtype, device)
+            for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)
+        ]
+        self.cell_anchors = cell_anchors
+
+    def num_anchors_per_location(self):
+        # number of anchors predicted at each location of every prediction feature layer
+        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]
+
+    # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
+    # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
+    def grid_anchors(self, grid_sizes, strides):
+        # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
+        """
+        Map anchor positions from the grid coordinate system onto the original image,
+        i.e. compute the coordinates of all anchors of each feature map on the original image.
+        Args:
+            grid_sizes: height and width of each prediction feature map
+            strides: stride on the original image that one step on the feature map corresponds to
+        """
+        anchors = []
+        cell_anchors = self.cell_anchors
+        assert cell_anchors is not None
+
+        # iterate over the grid_size, stride and cell_anchors of every prediction feature layer
+        for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
+            grid_height, grid_width = size
+            stride_height, stride_width = stride
+            device = base_anchors.device
+
+            # For output anchor, compute [x_center, y_center, x_center, y_center]
+            # shape: [grid_width], the x coordinates (columns) on the original image
+            shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
+            # shape: [grid_height], the y coordinates (rows) on the original image
+            shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height
+
+            # compute, for every point of the feature map, the corresponding coordinates on the
+            # original image (the coordinate offsets applied to the anchor templates)
+            # torch.meshgrid takes the row and column coordinates and returns the
+            # row-coordinate grid and the column-coordinate grid
+            # shape: [grid_height, grid_width]
+            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
+            shift_x = shift_x.reshape(-1)
+            shift_y = shift_y.reshape(-1)
+
+            # offsets of the anchor coordinates (xmin, ymin, xmax, ymax) on the original image
+            # shape: [grid_width*grid_height, 4]
+            shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)
+
+            # For every (base anchor, output anchor) pair,
+            # offset each zero-centered base anchor by the center of the output anchor.
+            # adding the offsets to the anchor templates yields all anchors on the
+            # original image (broadcasting handles the shape mismatch)
+            shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
+            anchors.append(shifts_anchor.reshape(-1, 4))
+
+        return anchors  # List[Tensor(all_num_anchors, 4)]
+
+    def cached_grid_anchors(self, grid_sizes, strides):
+        # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
+        """Cache all computed anchors."""
+        key = str(grid_sizes) + str(strides)
+        # self._cache is a dict
+        if key in self._cache:
+            return self._cache[key]
+        anchors = self.grid_anchors(grid_sizes, strides)
+        self._cache[key] = anchors
+        return anchors
+
+    def forward(self, image_list, feature_maps):
+        # type: (ImageList, List[Tensor]) -> List[Tensor]
+        # get the size (height, width) of every prediction feature map
+        grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])
+
+        # get the height and width of the input images
+        image_size = image_list.tensors.shape[-2:]
+
+        # get dtype and device
+        dtype, device = feature_maps[0].dtype, feature_maps[0].device
+
+        # one step in the feature map equates to an n-pixel stride on the original image
+        # compute the stride on the original image that one step on the feature map corresponds to
+        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
+                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]
+
+        # generate the anchor templates from the given sizes and aspect_ratios
+        self.set_cell_anchors(dtype, device)
+
+        # compute (or read from the cache) the coordinates of all anchors; these are the anchors
+        # mapped back onto the original image, not the anchor templates
+        # the result is a list with one entry per feature map, holding that map's anchors on the original image
+        anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)
+
+        anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
+        # iterate over every image in the batch
+        for i, (image_height, image_width) in enumerate(image_list.image_sizes):
+            anchors_in_image = []
+            # iterate over the anchors of every prediction feature map
+            for anchors_per_feature_map in anchors_over_all_feature_maps:
+                anchors_in_image.append(anchors_per_feature_map)
+            anchors.append(anchors_in_image)
+        # concatenate the anchors of all prediction feature layers for each image
+        # anchors is a list with one element per image, holding all of that image's anchors
+        anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
+        # Clear the cache in case that memory leaks.
+        self._cache.clear()
+        return anchors
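As a rough shape check (editor's sketch, CPU only): a 38x38 feature map with stride 8 and the three-ratio template above should yield 38*38*3 = 4332 anchors:

gen = AnchorsGenerator(sizes=(128,), aspect_ratios=(0.5, 1.0, 2.0))
gen.set_cell_anchors(torch.float32, torch.device("cpu"))
anchors = gen.grid_anchors([[38, 38]], [[torch.tensor(8), torch.tensor(8)]])
print(anchors[0].shape)  # torch.Size([4332, 4])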