Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
- .gitattributes +12 -0
- assets/1.jpg +3 -0
- assets/2.jpg +3 -0
- assets/3.jpg +3 -0
- assets/4.jpg +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738985521.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995505.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995594.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995658.Iflight +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742698.autodl-container-10a44fbcf4-a7468946 +3 -0
- checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742996.autodl-container-10a44fbcf4-a7468946 +3 -0
- checkpoints/imagenet/hole_benchmark/gen_00430000.pt +3 -0
- checkpoints/ostracoda_cyclegan/latest_net_D_A.pth +3 -0
- checkpoints/ostracoda_cyclegan/latest_net_D_B.pth +3 -0
- data/style/11.png +3 -0
- data/style/32.jpg +3 -0
- data/style/6.jpg +3 -0
- data/style/7.jpg +3 -0
- data/texture/16.jpg +3 -0
- data/texture/17.jpg +3 -0
- data/texture/4.jpg +3 -0
- data/texture/8.jpg +3 -0
- model/tokenizer/tokenizer_config.json +34 -0
- model/tokenizer/vocab.json +0 -0
- model/unet/config.json +36 -0
- model/vae/config.json +29 -0
- sddfrcnn_model/__pycache__/draw_box_utils.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__init__.py +3 -0
- sddfrcnn_model/backbone/__pycache__/__init__.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/feature_pyramid_network.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/res50_backbone.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/resnet50_fpn_model.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/ssd_model.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/__pycache__/utils.cpython-310.pyc +0 -0
- sddfrcnn_model/backbone/feature_pyramid_network.py +283 -0
- sddfrcnn_model/backbone/res50_backbone.py +106 -0
- sddfrcnn_model/backbone/resnet50_fpn_model.py +199 -0
- sddfrcnn_model/backbone/ssd_model.py +225 -0
- sddfrcnn_model/backbone/utils.py +628 -0
- sddfrcnn_model/draw_box_utils.py +197 -0
- sddfrcnn_model/network_files/__init__.py +1 -0
- sddfrcnn_model/network_files/__pycache__/__init__.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/anchor_utils.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/boxes.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/det_utils.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/image_list.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/losses.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/retinanet.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/__pycache__/transform.cpython-310.pyc +0 -0
- sddfrcnn_model/network_files/anchor_utils.py +192 -0
.gitattributes CHANGED
@@ -65,3 +65,15 @@ data/style/5.jpg filter=lfs diff=lfs merge=lfs -text
 data/style/59.png filter=lfs diff=lfs merge=lfs -text
 data/texture/14.jpg filter=lfs diff=lfs merge=lfs -text
 data/texture/15.jpg filter=lfs diff=lfs merge=lfs -text
+data/texture/16.jpg filter=lfs diff=lfs merge=lfs -text
+data/texture/4.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/7.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/11.png filter=lfs diff=lfs merge=lfs -text
+data/texture/8.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/6.jpg filter=lfs diff=lfs merge=lfs -text
+data/style/32.jpg filter=lfs diff=lfs merge=lfs -text
+data/texture/17.jpg filter=lfs diff=lfs merge=lfs -text
+assets/2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/3.jpg filter=lfs diff=lfs merge=lfs -text
+assets/4.jpg filter=lfs diff=lfs merge=lfs -text
+assets/1.jpg filter=lfs diff=lfs merge=lfs -text
assets/1.jpg ADDED (binary image, stored via Git LFS)
assets/2.jpg ADDED (binary image, stored via Git LFS)
assets/3.jpg ADDED (binary image, stored via Git LFS)
assets/4.jpg ADDED (binary image, stored via Git LFS)
checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738985521.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:970514f929b756e7026179fd443b21eff57e61901c16d3bbd3af81afe0de53dd
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995505.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:db215c86a3dfe491ef28a21766d485c84edf70099f98464be9fa1cddd3f4e633
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995594.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bde18faa073f516a72bbdd40bb5e08e5fb8da56914455612b49d9c8d85b3cca8
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1738995658.Iflight ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e7127eee0799d2360a417549a82c10cc3b12ec09f9015495257ec92e55383894
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742698.autodl-container-10a44fbcf4-a7468946 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f75095a66582caded1c54a9fa7cc3e7edc13d3e9e17a559929cace1f64f6f7e2
size 40

checkpoints/imagenet/hole_benchmark/events.out.tfevents.1745742996.autodl-container-10a44fbcf4-a7468946 ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:79c1f85ed120d2ac696dfa584e7368b9a5eae288f5e6d938ff68788146279d5c
size 152845

checkpoints/imagenet/hole_benchmark/gen_00430000.pt ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ee688f6cf0649a0eeea9c4623719eeab52bf39f2a5f2dabf80cbcf1995f289b3
size 14443538

checkpoints/ostracoda_cyclegan/latest_net_D_A.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0a52376b4c7fdb72089e48a3aa1e9c6f3f26576dba68c01f058643baf4506944
size 11063002

checkpoints/ostracoda_cyclegan/latest_net_D_B.pth ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7894e2f5f98edad44b0eb0202ce5b6f64669a0436099caaf36719ea2a8e963eb
size 11063002
data/style/11.png ADDED (binary image, stored via Git LFS)
data/style/32.jpg ADDED (binary image, stored via Git LFS)
data/style/6.jpg ADDED (binary image, stored via Git LFS)
data/style/7.jpg ADDED (binary image, stored via Git LFS)
data/texture/16.jpg ADDED (binary image, stored via Git LFS)
data/texture/17.jpg ADDED (binary image, stored via Git LFS)
data/texture/4.jpg ADDED (binary image, stored via Git LFS)
data/texture/8.jpg ADDED (binary image, stored via Git LFS)
model/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,34 @@
{
  "add_prefix_space": false,
  "bos_token": {
    "__type": "AddedToken",
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "do_lower_case": true,
  "eos_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "errors": "replace",
  "model_max_length": 77,
  "name_or_path": "openai/clip-vit-large-patch14",
  "pad_token": "<|endoftext|>",
  "special_tokens_map_file": "./special_tokens_map.json",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": {
    "__type": "AddedToken",
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  }
}
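This is the stock CLIP text tokenizer configuration used by Stable Diffusion v1.x checkpoints. A minimal loading sketch (assumptions: transformers is installed, the repo is cloned locally, and the sibling tokenizer files this truncated 50-file view does not list, such as merges.txt and special_tokens_map.json, are also present in model/tokenizer):

    from transformers import CLIPTokenizer

    # Load the tokenizer added in this commit from its local folder.
    tokenizer = CLIPTokenizer.from_pretrained("model/tokenizer")

    # "a photo of an ostracod" is an arbitrary example prompt; padding to
    # model_max_length (77, per the config above) mirrors how diffusion
    # pipelines feed the text encoder.
    ids = tokenizer("a photo of an ostracod", padding="max_length",
                    max_length=tokenizer.model_max_length).input_ids
    print(len(ids))  # 77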
model/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff.
model/unet/config.json ADDED
@@ -0,0 +1,36 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.6.0",
  "act_fn": "silu",
  "attention_head_dim": 8,
  "block_out_channels": [
    320,
    640,
    1280,
    1280
  ],
  "center_input_sample": false,
  "cross_attention_dim": 768,
  "down_block_types": [
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D",
    "DownBlock2D"
  ],
  "downsample_padding": 1,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": 2,
  "mid_block_scale_factor": 1,
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "out_channels": 4,
  "sample_size": 64,
  "up_block_types": [
    "UpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D"
  ]
}
model/vae/config.json ADDED
@@ -0,0 +1,29 @@
{
  "_class_name": "AutoencoderKL",
  "_diffusers_version": "0.6.0",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "in_channels": 3,
  "latent_channels": 4,
  "layers_per_block": 2,
  "norm_num_groups": 32,
  "out_channels": 3,
  "sample_size": 512,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ]
}
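Both of these configs match the standard Stable Diffusion v1 UNet and VAE. A minimal sketch of instantiating the two architectures from the configs alone (assumption: diffusers is installed; from_config builds randomly initialized weights, so if the repo also ships weight files, which this truncated view does not show, prefer from_pretrained with the matching subfolder):

    import json
    from diffusers import UNet2DConditionModel, AutoencoderKL

    # Build the architectures from the shipped configs (random weights).
    with open("model/unet/config.json") as f:
        unet = UNet2DConditionModel.from_config(json.load(f))
    with open("model/vae/config.json") as f:
        vae = AutoencoderKL.from_config(json.load(f))

    print(unet.config.cross_attention_dim)  # 768
    print(vae.config.latent_channels)       # 4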
sddfrcnn_model/__pycache__/draw_box_utils.cpython-310.pyc ADDED
Binary file (5.2 kB)
sddfrcnn_model/backbone/__init__.py ADDED
@@ -0,0 +1,3 @@
from .feature_pyramid_network import FeaturePyramidNetwork, LastLevelP6P7, LastLevelMaxPool
from .resnet50_fpn_model import resnet50_fpn_backbone
from .ssd_model import SSD300, Backbone
sddfrcnn_model/backbone/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (407 Bytes)

sddfrcnn_model/backbone/__pycache__/feature_pyramid_network.cpython-310.pyc ADDED
Binary file (10.1 kB)

sddfrcnn_model/backbone/__pycache__/res50_backbone.cpython-310.pyc ADDED
Binary file (3.27 kB)

sddfrcnn_model/backbone/__pycache__/resnet50_fpn_model.cpython-310.pyc ADDED
Binary file (6.26 kB)

sddfrcnn_model/backbone/__pycache__/ssd_model.cpython-310.pyc ADDED
Binary file (6.61 kB)

sddfrcnn_model/backbone/__pycache__/utils.cpython-310.pyc ADDED
Binary file (14.9 kB)
sddfrcnn_model/backbone/feature_pyramid_network.py ADDED
@@ -0,0 +1,283 @@
from collections import OrderedDict

import torch.nn as nn
import torch
from torch import Tensor
import torch.nn.functional as F

from torch.jit.annotations import Tuple, List, Dict


class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model.
    It has a strong assumption that the modules have been registered
    into the model in the same order as they are used.
    This means that one should **not** reuse the same nn.Module
    twice in the forward if you want this to work.
    Additionally, it is only able to query submodules that are directly
    assigned to the model. So if `model` is passed, `model.feature1` can
    be returned, but not `model.feature1.layer2`.
    Arguments:
        model (nn.Module): model on which we will extract the features
        return_layers (Dict[name, new_name]): a dict containing the names
            of the modules for which the activations will be returned as
            the key of the dict, and the value of the dict is the name
            of the returned activation (which the user can specify).
    """
    __annotations__ = {
        "return_layers": Dict[str, str],
    }

    def __init__(self, model, return_layers):
        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
            raise ValueError("return_layers are not present in model")

        orig_return_layers = return_layers
        return_layers = {str(k): str(v) for k, v in return_layers.items()}
        layers = OrderedDict()

        # walk the model's child modules in order, storing them in an OrderedDict;
        # keep only layer4 and everything before it, dropping the unused tail
        for name, module in model.named_children():
            layers[name] = module
            if name in return_layers:
                del return_layers[name]
            if not return_layers:
                break

        super().__init__(layers)
        self.return_layers = orig_return_layers

    def forward(self, x):
        out = OrderedDict()
        # run the input through every child module in order and
        # collect the outputs of layer1, layer2, layer3, layer4
        for name, module in self.items():
            x = module(x)
            if name in self.return_layers:
                out_name = self.return_layers[name]
                out[out_name] = x
        return out


class BackboneWithFPN(nn.Module):
    """
    Adds an FPN on top of a model.
    Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
    extract a submodel that returns the feature maps specified in return_layers.
    The same limitations of IntermediateLayerGetter apply here.
    Arguments:
        backbone (nn.Module)
        return_layers (Dict[name, new_name]): a dict containing the names
            of the modules for which the activations will be returned as
            the key of the dict, and the value of the dict is the name
            of the returned activation (which the user can specify).
        in_channels_list (List[int]): number of channels for each feature map
            that is returned, in the order they are present in the OrderedDict
        out_channels (int): number of channels in the FPN.
        extra_blocks: ExtraFPNBlock
    Attributes:
        out_channels (int): the number of channels in the FPN
    """

    def __init__(self,
                 backbone: nn.Module,
                 return_layers=None,
                 in_channels_list=None,
                 out_channels=256,
                 extra_blocks=None,
                 re_getter=True):
        super().__init__()

        if extra_blocks is None:
            extra_blocks = LastLevelMaxPool()

        if re_getter:
            assert return_layers is not None
            self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
        else:
            self.body = backbone

        self.fpn = FeaturePyramidNetwork(
            in_channels_list=in_channels_list,
            out_channels=out_channels,
            extra_blocks=extra_blocks,
        )

        self.out_channels = out_channels

    def forward(self, x):
        x = self.body(x)
        x = self.fpn(x)
        return x


class ExtraFPNBlock(nn.Module):
    """
    Base class for the extra block in the FPN.

    Args:
        results (List[Tensor]): the result of the FPN
        x (List[Tensor]): the original feature maps
        names (List[str]): the names for each one of the
            original feature maps

    Returns:
        results (List[Tensor]): the extended set of results
            of the FPN
        names (List[str]): the extended set of names for the results
    """
    def forward(self,
                results: List[Tensor],
                x: List[Tensor],
                names: List[str]) -> Tuple[List[Tensor], List[str]]:
        pass


class LastLevelMaxPool(torch.nn.Module):
    """
    Applies a max_pool2d on top of the last feature map
    """

    def forward(self, x: List[Tensor], y: List[Tensor], names: List[str]) -> Tuple[List[Tensor], List[str]]:
        names.append("pool")
        x.append(F.max_pool2d(x[-1], 1, 2, 0))
        return x, names


class LastLevelP6P7(ExtraFPNBlock):
    """
    This module is used in RetinaNet to generate extra layers, P6 and P7.
    """
    def __init__(self, in_channels: int, out_channels: int):
        super().__init__()
        self.p6 = nn.Conv2d(in_channels, out_channels, 3, 2, 1)
        self.p7 = nn.Conv2d(out_channels, out_channels, 3, 2, 1)
        for module in [self.p6, self.p7]:
            nn.init.kaiming_uniform_(module.weight, a=1)
            nn.init.constant_(module.bias, 0)
        self.use_P5 = in_channels == out_channels

    def forward(self,
                p: List[Tensor],
                c: List[Tensor],
                names: List[str]) -> Tuple[List[Tensor], List[str]]:
        p5, c5 = p[-1], c[-1]
        x = p5 if self.use_P5 else c5
        p6 = self.p6(x)
        p7 = self.p7(F.relu(p6))
        p.extend([p6, p7])
        names.extend(["p6", "p7"])
        return p, names


class FeaturePyramidNetwork(nn.Module):
    """
    Module that adds an FPN on top of a set of feature maps. This is based on
    `"Feature Pyramid Network for Object Detection" <https://arxiv.org/abs/1612.03144>`_.
    The feature maps are currently supposed to be in increasing depth
    order.
    The input to the model is expected to be an OrderedDict[Tensor], containing
    the feature maps on top of which the FPN will be added.
    Arguments:
        in_channels_list (list[int]): number of channels for each feature map that
            is passed to the module
        out_channels (int): number of channels of the FPN representation
        extra_blocks (ExtraFPNBlock or None): if provided, extra operations will
            be performed. It is expected to take the fpn features, the original
            features and the names of the original features as input, and returns
            a new list of feature maps and their corresponding names
    """

    def __init__(self, in_channels_list, out_channels, extra_blocks=None):
        super().__init__()
        # 1x1 convs that project the resnet feature maps (layer1-4) to a common channel count
        self.inner_blocks = nn.ModuleList()
        # 3x3 convs applied to the projected maps to produce the prediction feature maps
        self.layer_blocks = nn.ModuleList()
        for in_channels in in_channels_list:
            if in_channels == 0:
                continue
            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)
            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, padding=1)
            self.inner_blocks.append(inner_block_module)
            self.layer_blocks.append(layer_block_module)

        # initialize parameters now to avoid modifying the initialization of top_blocks
        for m in self.children():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_uniform_(m.weight, a=1)
                nn.init.constant_(m.bias, 0)

        self.extra_blocks = extra_blocks

    def get_result_from_inner_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.inner_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.inner_blocks)
        if idx < 0:
            idx += num_blocks
        i = 0
        out = x
        for module in self.inner_blocks:
            if i == idx:
                out = module(x)
            i += 1
        return out

    def get_result_from_layer_blocks(self, x: Tensor, idx: int) -> Tensor:
        """
        This is equivalent to self.layer_blocks[idx](x),
        but torchscript doesn't support this yet
        """
        num_blocks = len(self.layer_blocks)
        if idx < 0:
            idx += num_blocks
        i = 0
        out = x
        for module in self.layer_blocks:
            if i == idx:
                out = module(x)
            i += 1
        return out

    def forward(self, x: Dict[str, Tensor]) -> Dict[str, Tensor]:
        """
        Computes the FPN for a set of feature maps.
        Arguments:
            x (OrderedDict[Tensor]): feature maps for each feature level.
        Returns:
            results (OrderedDict[Tensor]): feature maps after FPN layers.
                They are ordered from highest resolution first.
        """
        # unpack OrderedDict into two lists for easier handling
        names = list(x.keys())
        x = list(x.values())

        # project resnet layer4's channels down to the requested out_channels
        # last_inner = self.inner_blocks[-1](x[-1])
        last_inner = self.get_result_from_inner_blocks(x[-1], -1)
        # results holds every prediction feature map
        results = []
        # pass the projected layer4 map through a 3x3 conv to get its prediction feature map
        # results.append(self.layer_blocks[-1](last_inner))
        results.append(self.get_result_from_layer_blocks(last_inner, -1))

        for idx in range(len(x) - 2, -1, -1):
            inner_lateral = self.get_result_from_inner_blocks(x[idx], idx)
            feat_shape = inner_lateral.shape[-2:]
            inner_top_down = F.interpolate(last_inner, size=feat_shape, mode="nearest")
            last_inner = inner_lateral + inner_top_down
            results.insert(0, self.get_result_from_layer_blocks(last_inner, idx))

        # generate a fifth prediction feature map on top of the layer4 one
        if self.extra_blocks is not None:
            results, names = self.extra_blocks(results, x, names)

        # make it back an OrderedDict
        out = OrderedDict([(k, v) for k, v in zip(names, results)])

        return out
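BackboneWithFPN above is self-contained, so it can wrap any backbone whose children expose the requested layers. A minimal smoke test (assumptions: run from the repo root with torch and torchvision installed; the stock torchvision ResNet-50 stands in for the project's own backbone):

    import torch
    import torchvision
    from sddfrcnn_model.backbone.feature_pyramid_network import BackboneWithFPN

    # layer1..layer4 of ResNet-50 output 256/512/1024/2048 channels.
    backbone = BackboneWithFPN(
        torchvision.models.resnet50(),
        return_layers={"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"},
        in_channels_list=[256, 512, 1024, 2048],
        out_channels=256)
    feats = backbone(torch.randn(1, 3, 224, 224))
    # keys '0'..'3' plus 'pool' (added by LastLevelMaxPool), all with 256 channels
    print({k: tuple(v.shape) for k, v in feats.items()})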
sddfrcnn_model/backbone/res50_backbone.py ADDED
@@ -0,0 +1,106 @@
import torch.nn as nn
import torch


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channel, out_channel, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
                               kernel_size=1, stride=1, bias=False)  # squeeze channels
        self.bn1 = nn.BatchNorm2d(out_channel)
        # -----------------------------------------
        self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
                               kernel_size=3, stride=stride, bias=False, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channel)
        # -----------------------------------------
        self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel*self.expansion,
                               kernel_size=1, stride=1, bias=False)  # unsqueeze channels
        self.bn3 = nn.BatchNorm2d(out_channel*self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, blocks_num, num_classes=1000, include_top=True):
        super(ResNet, self).__init__()
        self.include_top = include_top
        self.in_channel = 64

        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channel)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, blocks_num[0])
        self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
        self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
        self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))  # output size = (1, 1)
            self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        downsample = None
        if stride != 1 or self.in_channel != channel * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(channel * block.expansion))

        layers = []
        layers.append(block(self.in_channel, channel, downsample=downsample, stride=stride))
        self.in_channel = channel * block.expansion

        for _ in range(1, block_num):
            layers.append(block(self.in_channel, channel))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        if self.include_top:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.fc(x)

        return x


def resnet50(num_classes=1000, include_top=True):
    return ResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes, include_top=include_top)
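A quick sanity check of this plain classifier variant (assumption: run from the repo root); the include_top=False configuration is what the SSD backbone further below relies on:

    import torch
    from sddfrcnn_model.backbone.res50_backbone import resnet50

    net = resnet50(num_classes=1000)
    logits = net(torch.randn(2, 3, 224, 224))
    print(logits.shape)  # torch.Size([2, 1000])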
sddfrcnn_model/backbone/resnet50_fpn_model.py ADDED
@@ -0,0 +1,199 @@
import os

import torch.nn as nn
import torch
from torchvision.ops.misc import FrozenBatchNorm2d

from .feature_pyramid_network import LastLevelMaxPool, BackboneWithFPN


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, in_channel, out_channel, stride=1, downsample=None, norm_layer=None):
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d

        self.conv1 = nn.Conv2d(in_channels=in_channel, out_channels=out_channel,
                               kernel_size=1, stride=1, bias=False)  # squeeze channels
        self.bn1 = norm_layer(out_channel)
        # -----------------------------------------
        self.conv2 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel,
                               kernel_size=3, stride=stride, bias=False, padding=1)
        self.bn2 = norm_layer(out_channel)
        # -----------------------------------------
        self.conv3 = nn.Conv2d(in_channels=out_channel, out_channels=out_channel * self.expansion,
                               kernel_size=1, stride=1, bias=False)  # unsqueeze channels
        self.bn3 = norm_layer(out_channel * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample

    def forward(self, x):
        identity = x
        if self.downsample is not None:
            identity = self.downsample(x)

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        out += identity
        out = self.relu(out)

        return out


class ResNet(nn.Module):

    def __init__(self, block, blocks_num, num_classes=1000, include_top=True, norm_layer=None):
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.include_top = include_top
        self.in_channel = 64

        self.conv1 = nn.Conv2d(3, self.in_channel, kernel_size=7, stride=2,
                               padding=3, bias=False)
        self.bn1 = norm_layer(self.in_channel)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, blocks_num[0])
        self.layer2 = self._make_layer(block, 128, blocks_num[1], stride=2)
        self.layer3 = self._make_layer(block, 256, blocks_num[2], stride=2)
        self.layer4 = self._make_layer(block, 512, blocks_num[3], stride=2)
        if self.include_top:
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))  # output size = (1, 1)
            self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')

    def _make_layer(self, block, channel, block_num, stride=1):
        norm_layer = self._norm_layer
        downsample = None
        if stride != 1 or self.in_channel != channel * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_channel, channel * block.expansion, kernel_size=1, stride=stride, bias=False),
                norm_layer(channel * block.expansion))

        layers = []
        layers.append(block(self.in_channel, channel, downsample=downsample,
                            stride=stride, norm_layer=norm_layer))
        self.in_channel = channel * block.expansion

        for _ in range(1, block_num):
            layers.append(block(self.in_channel, channel, norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        if self.include_top:
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.fc(x)

        return x


def overwrite_eps(model, eps):
    """
    This method overwrites the default eps values of all the
    FrozenBatchNorm2d layers of the model with the provided value.
    This is necessary to address the BC-breaking change introduced
    by the bug-fix at pytorch/vision#2933. The overwrite is applied
    only when the pretrained weights are loaded to maintain compatibility
    with previous versions.

    Args:
        model (nn.Module): The model on which we perform the overwrite.
        eps (float): The new value of eps.
    """
    for module in model.modules():
        if isinstance(module, FrozenBatchNorm2d):
            module.eps = eps


def resnet50_fpn_backbone(pretrain_path="",
                          norm_layer=FrozenBatchNorm2d,  # FrozenBatchNorm2d behaves like BatchNorm2d, but its parameters are never updated
                          trainable_layers=3,
                          returned_layers=None,
                          extra_blocks=None):
    """
    Build a ResNet-50 + FPN backbone.
    Args:
        pretrain_path: path to pretrained resnet50 weights; defaults to empty if unused
        norm_layer: the official default is FrozenBatchNorm2d, i.e. BN layers whose
            parameters are never updated (with very small batch sizes, trainable BN
            hurts more than it helps). If your GPU memory allows a large batch_size,
            you can pass a regular BatchNorm2d instead
            (https://github.com/facebookresearch/maskrcnn-benchmark/issues/267)
        trainable_layers: which backbone stages to leave trainable
        returned_layers: which stages' outputs to return
        extra_blocks: extra blocks added on top of the returned feature maps

    Returns:

    """
    resnet_backbone = ResNet(Bottleneck, [3, 4, 6, 3],
                             include_top=False,
                             norm_layer=norm_layer)

    if isinstance(norm_layer, FrozenBatchNorm2d):
        overwrite_eps(resnet_backbone, 0.0)

    if pretrain_path != "":
        assert os.path.exists(pretrain_path), "{} does not exist.".format(pretrain_path)
        # load pretrained weights
        print(resnet_backbone.load_state_dict(torch.load(pretrain_path), strict=False))

    # select layers that won't be frozen
    assert 0 <= trainable_layers <= 5
    layers_to_train = ['layer4', 'layer3', 'layer2', 'layer1', 'conv1'][:trainable_layers]

    # when training all layers, don't forget that there is a bn1 right after conv1
    if trainable_layers == 5:
        layers_to_train.append("bn1")

    # freeze layers
    for name, parameter in resnet_backbone.named_parameters():
        # freeze every parameter that does not belong to a layer in layers_to_train
        if all([not name.startswith(layer) for layer in layers_to_train]):
            parameter.requires_grad_(False)

    if extra_blocks is None:
        extra_blocks = LastLevelMaxPool()

    if returned_layers is None:
        returned_layers = [1, 2, 3, 4]
    # the number of returned feature maps must be greater than 0 and less than 5
    assert min(returned_layers) > 0 and max(returned_layers) < 5

    # return_layers = {'layer1': '0', 'layer2': '1', 'layer3': '2', 'layer4': '3'}
    return_layers = {f'layer{k}': str(v) for v, k in enumerate(returned_layers)}

    # in_channel is the channel count of layer4's output feature map = 2048
    in_channels_stage2 = resnet_backbone.in_channel // 8  # 256
    # channel counts of the resnet50 feature maps fed to the FPN
    in_channels_list = [in_channels_stage2 * 2 ** (i - 1) for i in returned_layers]
    # channel count of every feature map produced by the FPN
    out_channels = 256
    return BackboneWithFPN(resnet_backbone, return_layers, in_channels_list, out_channels, extra_blocks=extra_blocks)
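A sketch of calling this entry point as shipped (assumptions: run from the repo root; pretrain_path defaults to empty, so no weights are loaded; BN layers are frozen and only stages 2-4 are trainable by default):

    import torch
    from sddfrcnn_model.backbone import resnet50_fpn_backbone

    backbone = resnet50_fpn_backbone(trainable_layers=3)
    backbone.eval()
    feats = backbone(torch.randn(1, 3, 800, 800))
    for name, f in feats.items():
        print(name, tuple(f.shape))  # '0'..'3' plus 'pool', each with 256 channels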
sddfrcnn_model/backbone/ssd_model.py ADDED
@@ -0,0 +1,225 @@
import torch
from torch import nn, Tensor
from torch.jit.annotations import List

from .res50_backbone import resnet50
from .utils import dboxes300_coco, Encoder, PostProcess


class Backbone(nn.Module):
    def __init__(self, pretrain_path=None):
        super(Backbone, self).__init__()
        net = resnet50()
        self.out_channels = [1024, 512, 512, 256, 256, 256]

        if pretrain_path is not None:
            net.load_state_dict(torch.load(pretrain_path))

        self.feature_extractor = nn.Sequential(*list(net.children())[:7])

        conv4_block1 = self.feature_extractor[-1][0]

        # change conv4_block1's stride from 2 to 1
        conv4_block1.conv1.stride = (1, 1)
        conv4_block1.conv2.stride = (1, 1)
        conv4_block1.downsample[0].stride = (1, 1)

    def forward(self, x):
        x = self.feature_extractor(x)
        return x


class SSD300(nn.Module):
    def __init__(self, backbone=None, num_classes=21):
        super(SSD300, self).__init__()
        if backbone is None:
            raise Exception("backbone is None")
        if not hasattr(backbone, "out_channels"):
            raise Exception("the backbone does not have attribute: out_channels")
        self.feature_extractor = backbone

        self.num_classes = num_classes
        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        self._build_additional_features(self.feature_extractor.out_channels)
        self.num_defaults = [4, 6, 6, 6, 4, 4]
        location_extractors = []
        confidence_extractors = []

        # out_channels = [1024, 512, 512, 256, 256, 256] for resnet50
        for nd, oc in zip(self.num_defaults, self.feature_extractor.out_channels):
            # nd is number_default_boxes, oc is output_channel
            location_extractors.append(nn.Conv2d(oc, nd * 4, kernel_size=3, padding=1))
            confidence_extractors.append(nn.Conv2d(oc, nd * self.num_classes, kernel_size=3, padding=1))

        self.loc = nn.ModuleList(location_extractors)
        self.conf = nn.ModuleList(confidence_extractors)
        self._init_weights()

        default_box = dboxes300_coco()
        self.compute_loss = Loss(default_box)
        self.encoder = Encoder(default_box)
        self.postprocess = PostProcess(default_box)

    def _build_additional_features(self, input_size):
        """
        Append a series of extra convolutional layers to the backbone (resnet50)
        to obtain the corresponding extra feature extractors.
        :param input_size:
        :return:
        """
        additional_blocks = []
        # input_size = [1024, 512, 512, 256, 256, 256] for resnet50
        middle_channels = [256, 256, 128, 128, 128]
        for i, (input_ch, output_ch, middle_ch) in enumerate(zip(input_size[:-1], input_size[1:], middle_channels)):
            padding, stride = (1, 2) if i < 3 else (0, 1)
            layer = nn.Sequential(
                nn.Conv2d(input_ch, middle_ch, kernel_size=1, bias=False),
                nn.BatchNorm2d(middle_ch),
                nn.ReLU(inplace=True),
                nn.Conv2d(middle_ch, output_ch, kernel_size=3, padding=padding, stride=stride, bias=False),
                nn.BatchNorm2d(output_ch),
                nn.ReLU(inplace=True),
            )
            additional_blocks.append(layer)
        self.additional_blocks = nn.ModuleList(additional_blocks)

    def _init_weights(self):
        layers = [*self.additional_blocks, *self.loc, *self.conf]
        for layer in layers:
            for param in layer.parameters():
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)

    # Shape the classifier to the view of bboxes
    def bbox_view(self, features, loc_extractor, conf_extractor):
        locs = []
        confs = []
        for f, l, c in zip(features, loc_extractor, conf_extractor):
            # [batch, n*4, feat_size, feat_size] -> [batch, 4, -1]
            locs.append(l(f).view(f.size(0), 4, -1))
            # [batch, n*classes, feat_size, feat_size] -> [batch, classes, -1]
            confs.append(c(f).view(f.size(0), self.num_classes, -1))

        locs, confs = torch.cat(locs, 2).contiguous(), torch.cat(confs, 2).contiguous()
        return locs, confs

    def forward(self, image, targets=None):
        x = self.feature_extractor(image)

        # Feature Map 38x38x1024, 19x19x512, 10x10x512, 5x5x256, 3x3x256, 1x1x256
        detection_features = torch.jit.annotate(List[Tensor], [])  # [x]
        detection_features.append(x)
        for layer in self.additional_blocks:
            x = layer(x)
            detection_features.append(x)

        # Feature Map 38x38x4, 19x19x6, 10x10x6, 5x5x6, 3x3x4, 1x1x4
        locs, confs = self.bbox_view(detection_features, self.loc, self.conf)

        # For SSD 300, shall return nbatch x 8732 x {nlabels, nlocs} results
        # 38x38x4 + 19x19x6 + 10x10x6 + 5x5x6 + 3x3x4 + 1x1x4 = 8732

        if self.training:
            if targets is None:
                raise ValueError("In training mode, targets should be passed")
            # bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            bboxes_out = targets['boxes']
            bboxes_out = bboxes_out.transpose(1, 2).contiguous()
            # print(bboxes_out.is_contiguous())
            labels_out = targets['labels']
            # print(labels_out.is_contiguous())

            # ploc, plabel, gloc, glabel
            loss = self.compute_loss(locs, confs, bboxes_out, labels_out)
            return {"total_losses": loss}

        # apply the predicted regression offsets to the default boxes to get the final
        # boxes, then run non-maximum suppression to drop overlapping ones
        # results = self.encoder.decode_batch(locs, confs)
        results = self.postprocess(locs, confs)
        return results


class Loss(nn.Module):
    """
    Implements the loss as the sum of the followings:
    1. Confidence Loss: All labels, with hard negative mining
    2. Localization Loss: Only on positive labels
    Suppose input dboxes has the shape 8732x4
    """
    def __init__(self, dboxes):
        super(Loss, self).__init__()
        # The two scale factors come from the following link
        # http://jany.st/post/2017-11-05-single-shot-detector-ssd-from-scratch-in-tensorflow.html
        self.scale_xy = 1.0 / dboxes.scale_xy  # 10
        self.scale_wh = 1.0 / dboxes.scale_wh  # 5

        self.location_loss = nn.SmoothL1Loss(reduction='none')
        # [num_anchors, 4] -> [4, num_anchors] -> [1, 4, num_anchors]
        self.dboxes = nn.Parameter(dboxes(order="xywh").transpose(0, 1).unsqueeze(dim=0),
                                   requires_grad=False)

        self.confidence_loss = nn.CrossEntropyLoss(reduction='none')

    def _location_vec(self, loc):
        # type: (Tensor) -> Tensor
        """
        Generate Location Vectors
        Computes the ground-truth regression parameters relative to the anchors.
        :param loc: GT boxes matched to each anchor, Nx4x8732
        :return:
        """
        gxy = self.scale_xy * (loc[:, :2, :] - self.dboxes[:, :2, :]) / self.dboxes[:, 2:, :]  # Nx2x8732
        gwh = self.scale_wh * (loc[:, 2:, :] / self.dboxes[:, 2:, :]).log()  # Nx2x8732
        return torch.cat((gxy, gwh), dim=1).contiguous()

    def forward(self, ploc, plabel, gloc, glabel):
        # type: (Tensor, Tensor, Tensor, Tensor) -> Tensor
        """
        ploc, plabel: Nx4x8732, Nxlabel_numx8732
            predicted location and labels

        gloc, glabel: Nx4x8732, Nx8732
            ground truth location and labels
        """
        # mask of positive samples, Tensor: [N, 8732]
        mask = torch.gt(glabel, 0)  # (gt: >)
        # mask1 = torch.nonzero(glabel)
        # number of positive samples per image in the batch, Tensor: [N]
        pos_num = mask.sum(dim=1)

        # GT location regression targets, Tensor: [N, 4, 8732]
        vec_gd = self._location_vec(gloc)

        # sum on four coordinates, and mask
        # localization loss (positive samples only)
        loc_loss = self.location_loss(ploc, vec_gd).sum(dim=1)  # Tensor: [N, 8732]
        loc_loss = (mask.float() * loc_loss).sum(dim=1)  # Tensor: [N]

        # hard negative mining, Tensor: [N, 8732]
        con = self.confidence_loss(plabel, glabel)

        # positives are never selected
        # pick the negative samples
        con_neg = con.clone()
        con_neg[mask] = 0.0
        # sort by confidence loss in descending order, con_idx (Tensor: [N, 8732])
        _, con_idx = con_neg.sort(dim=1, descending=True)
        _, con_rank = con_idx.sort(dim=1)  # neat trick: this yields each element's rank

        # the number of negatives is three times the positives
        # (hard negative mining, per the original paper),
        # capped at the total sample count of 8732
        neg_num = torch.clamp(3 * pos_num, max=mask.size(1)).unsqueeze(-1)
        neg_mask = torch.lt(con_rank, neg_num)  # (lt: <) Tensor [N, 8732]

        # the final confidence loss uses the selected positive loss plus the selected negative loss
        con_loss = (con * (mask.float() + neg_mask.float())).sum(dim=1)  # Tensor [N]

        # avoid no object detected
        # guard against images that contain no GT boxes
        total_loss = loc_loss + con_loss
        # eg. [15, 3, 5, 0] -> [1.0, 1.0, 1.0, 0.0]
        num_mask = torch.gt(pos_num, 0).float()  # flag whether each image in the batch has any positives
        pos_num = pos_num.float().clamp(min=1e-6)  # avoid division by zero
        ret = (total_loss * num_mask / pos_num).mean(dim=0)  # average the loss only over images with positives
        return ret
ADDED
|
@@ -0,0 +1,628 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from math import sqrt
import itertools

import torch
import torch.nn.functional as F
from torch.jit.annotations import Tuple, List
from torch import nn, Tensor
import numpy as np


# This function is from https://github.com/kuangliu/pytorch-ssd.
# def calc_iou_tensor(box1, box2):
#     """ Calculation of IoU based on two boxes tensor,
#     Reference to https://github.com/kuangliu/pytorch-src
#     input:
#         box1 (N, 4) format [xmin, ymin, xmax, ymax]
#         box2 (M, 4) format [xmin, ymin, xmax, ymax]
#     output:
#         IoU (N, M)
#     """
#     N = box1.size(0)
#     M = box2.size(0)
#
#     # (N, 4) -> (N, 1, 4) -> (N, M, 4)
#     be1 = box1.unsqueeze(1).expand(-1, M, -1)  # -1 means not changing the size of that dimension
#     # (M, 4) -> (1, M, 4) -> (N, M, 4)
#     be2 = box2.unsqueeze(0).expand(N, -1, -1)
#
#     # Left Top and Right Bottom
#     lt = torch.max(be1[:, :, :2], be2[:, :, :2])
#     rb = torch.min(be1[:, :, 2:], be2[:, :, 2:])
#
#     # compute intersection area
#     delta = rb - lt  # width and height
#     delta[delta < 0] = 0
#     # width * height
#     intersect = delta[:, :, 0] * delta[:, :, 1]
#
#     # compute bel1 area
#     delta1 = be1[:, :, 2:] - be1[:, :, :2]
#     area1 = delta1[:, :, 0] * delta1[:, :, 1]
#     # compute bel2 area
#     delta2 = be2[:, :, 2:] - be2[:, :, :2]
#     area2 = delta2[:, :, 0] * delta2[:, :, 1]
#
#     iou = intersect / (area1 + area2 - intersect)
#     return iou


def box_area(boxes):
    """
    Computes the area of a set of bounding boxes, which are specified by their
    (x1, y1, x2, y2) coordinates.

    Arguments:
        boxes (Tensor[N, 4]): boxes for which the area will be computed. They
            are expected to be in (x1, y1, x2, y2) format

    Returns:
        area (Tensor[N]): area for each box
    """
    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])


def calc_iou_tensor(boxes1, boxes2):
    """
    Return intersection-over-union (Jaccard index) of boxes.

    Both sets of boxes are expected to be in (x1, y1, x2, y2) format.

    Arguments:
        boxes1 (Tensor[N, 4])
        boxes2 (Tensor[M, 4])

    Returns:
        iou (Tensor[N, M]): the NxM matrix containing the pairwise
            IoU values for every element in boxes1 and boxes2
    """
    area1 = box_area(boxes1)
    area2 = box_area(boxes2)

    # When the shapes do not match,
    # the shape of the returned output tensor follows the broadcasting rules
    lt = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # left-top [N,M,2]
    rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # right-bottom [N,M,2]

    wh = (rb - lt).clamp(min=0)  # [N,M,2]
    inter = wh[:, :, 0] * wh[:, :, 1]  # [N,M]

    iou = inter / (area1[:, None] + area2 - inter)
    return iou

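A quick sanity check of `calc_iou_tensor` (a hedged sketch; the box values are made up): two unit squares offset by half their width overlap in a 0.5 x 1 strip, so the IoU is 0.5 / (1 + 1 - 0.5) = 1/3.

import torch

# hypothetical toy boxes, (xmin, ymin, xmax, ymax)
boxes1 = torch.tensor([[0.0, 0.0, 1.0, 1.0]])
boxes2 = torch.tensor([[0.5, 0.0, 1.5, 1.0],
                       [2.0, 2.0, 3.0, 3.0]])

iou = calc_iou_tensor(boxes1, boxes2)  # shape [1, 2]
print(iou)  # expected roughly [[0.3333, 0.0]]
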
# This function is from https://github.com/kuangliu/pytorch-ssd.
class Encoder(object):
    """
    Inspired by https://github.com/kuangliu/pytorch-src
    Transform between (bboxes, labels) <-> SSD output

    dboxes: default boxes in size 8732 x 4,
        encoder: input ltrb format, output xywh format
        decoder: input xywh format, output ltrb format

    encode:
        input  : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
        output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
        criteria : IoU threshold of bboxes

    decode:
        input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
        output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
        criteria : IoU threshold of bboxes
        max_output : maximum number of output bboxes
    """
    def __init__(self, dboxes):
        self.dboxes = dboxes(order='ltrb')
        self.dboxes_xywh = dboxes(order='xywh').unsqueeze(dim=0)
        self.nboxes = self.dboxes.size(0)  # number of default boxes
        self.scale_xy = dboxes.scale_xy
        self.scale_wh = dboxes.scale_wh

    def encode(self, bboxes_in, labels_in, criteria=0.5):
        """
        encode:
            input  : bboxes_in (Tensor nboxes x 4), labels_in (Tensor nboxes)
            output : bboxes_out (Tensor 8732 x 4), labels_out (Tensor 8732)
            criteria : IoU threshold of bboxes
        """
        # [nboxes, 8732]
        ious = calc_iou_tensor(bboxes_in, self.dboxes)  # IoU between every GT box and every default box
        # [8732,]
        best_dbox_ious, best_dbox_idx = ious.max(dim=0)  # best-matching GT for each default box
        # [nboxes,]
        best_bbox_ious, best_bbox_idx = ious.max(dim=1)  # best-matching default box for each GT

        # Mark the best default box of every GT as a positive sample
        # (first rule of the paper's matching strategy).
        # set best ious 2.0
        best_dbox_ious.index_fill_(0, best_bbox_idx, 2.0)  # dim, index, value
        # Overwrite those default boxes' matched-GT indices accordingly.
        idx = torch.arange(0, best_bbox_idx.size(0), dtype=torch.int64)
        best_dbox_idx[best_bbox_idx[idx]] = idx

        # filter IoU > 0.5
        # Keep default boxes whose IoU with a GT exceeds the threshold (second matching
        # rule; the first rule's matches are already included via the 2.0 fill above).
        masks = best_dbox_ious > criteria
        # [8732,]
        labels_out = torch.zeros(self.nboxes, dtype=torch.int64)
        labels_out[masks] = labels_in[best_dbox_idx[masks]]
        # Replace positive default boxes with the coordinates of their matched GT.
        bboxes_out = self.dboxes.clone()
        bboxes_out[masks, :] = bboxes_in[best_dbox_idx[masks], :]

        # Transform format to xywh format
        x = 0.5 * (bboxes_out[:, 0] + bboxes_out[:, 2])  # x
        y = 0.5 * (bboxes_out[:, 1] + bboxes_out[:, 3])  # y
        w = bboxes_out[:, 2] - bboxes_out[:, 0]  # w
        h = bboxes_out[:, 3] - bboxes_out[:, 1]  # h
        bboxes_out[:, 0] = x
        bboxes_out[:, 1] = y
        bboxes_out[:, 2] = w
        bboxes_out[:, 3] = h
        return bboxes_out, labels_out

    def scale_back_batch(self, bboxes_in, scores_in):
        """
        Convert boxes from xywh back to ltrb and run the predicted scores through softmax.
        Do scale and transform from xywh to ltrb
        suppose input N x 4 x num_bbox | N x label_num x num_bbox

        bboxes_in: xywh regression parameters predicted by the network
        scores_in: per-class probabilities predicted for each default box
        """
        if bboxes_in.device == torch.device("cpu"):
            self.dboxes = self.dboxes.cpu()
            self.dboxes_xywh = self.dboxes_xywh.cpu()
        else:
            self.dboxes = self.dboxes.cuda()
            self.dboxes_xywh = self.dboxes_xywh.cuda()

        # Returns a view of the original tensor with its dimensions permuted.
        bboxes_in = bboxes_in.permute(0, 2, 1)
        scores_in = scores_in.permute(0, 2, 1)
        # print(bboxes_in.is_contiguous())

        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]  # predicted x, y regression parameters
        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]  # predicted w, h regression parameters

        # Apply the regression parameters to the default boxes to get the final predicted boxes.
        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]

        # transform format to ltrb
        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]

        bboxes_in[:, :, 0] = l  # xmin
        bboxes_in[:, :, 1] = t  # ymin
        bboxes_in[:, :, 2] = r  # xmax
        bboxes_in[:, :, 3] = b  # ymax

        return bboxes_in, F.softmax(scores_in, dim=-1)

    def decode_batch(self, bboxes_in, scores_in, criteria=0.45, max_output=200):
        # Convert boxes from xywh back to ltrb (which simplifies the IoU computation
        # inside NMS) and softmax the predicted scores.
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        outputs = []
        # iterate over every image in the batch
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            outputs.append(self.decode_single_new(bbox, prob, criteria, max_output))
        return outputs

    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output=200):
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
        """
        device = bboxes_in.device
        num_classes = scores_in.shape[-1]

        # clip boxes that fall outside the image
        bboxes_in = bboxes_in.clamp(min=0, max=1)

        # [8732, 4] -> [8732, 21, 4]
        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)

        # create labels for each prediction
        labels = torch.arange(num_classes, device=device)
        labels = labels.view(1, -1).expand_as(scores_in)

        # remove predictions with the background label
        bboxes_in = bboxes_in[:, 1:, :]
        scores_in = scores_in[:, 1:]
        labels = labels[:, 1:]

        # batch everything, by making every class prediction be a separate instance
        bboxes_in = bboxes_in.reshape(-1, 4)
        scores_in = scores_in.reshape(-1)
        labels = labels.reshape(-1)

        # remove low-scoring boxes (score threshold 0.05)
        inds = torch.nonzero(scores_in > 0.05, as_tuple=False).squeeze(1)
        bboxes_in, scores_in, labels = bboxes_in[inds], scores_in[inds], labels[inds]

        # remove empty boxes
        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
        keep = (ws >= 0.1 / 300) & (hs >= 0.1 / 300)
        keep = keep.nonzero(as_tuple=False).squeeze(1)
        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]

        # non-maximum suppression
        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)

        # keep only top-k scoring predictions
        keep = keep[:num_output]
        bboxes_out = bboxes_in[keep, :]
        scores_out = scores_in[keep]
        labels_out = labels[keep]

        return bboxes_out, labels_out, scores_out

    # perform non-maximum suppression
    def decode_single(self, bboxes_in, scores_in, criteria, max_output, max_num=200):
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
        """
        # Reference to https://github.com/amdegroot/ssd.pytorch
        bboxes_out = []
        scores_out = []
        labels_out = []

        # non-maximum suppression
        # scores_in (Tensor 8732 x nitems): iterate over the columns, i.e. the
        # probabilities of all 8732 boxes for one class at a time
        for i, score in enumerate(scores_in.split(1, 1)):
            # skip background
            if i == 0:
                continue

            # [8732, 1] -> [8732]
            score = score.squeeze(1)

            # filter out predictions with probability below 0.05
            mask = score > 0.05
            bboxes, score = bboxes_in[mask, :], score[mask]
            if score.size(0) == 0:
                continue

            # sort scores in ascending order
            score_sorted, score_idx_sorted = score.sort(dim=0)

            # select max_output indices
            score_idx_sorted = score_idx_sorted[-max_num:]
            candidates = []

            while score_idx_sorted.numel() > 0:
                idx = score_idx_sorted[-1].item()
                # boxes of the remaining candidates, Tensor [num_candidates, 4]
                bboxes_sorted = bboxes[score_idx_sorted, :]
                # the current top-scoring box, Tensor [1, 4]
                bboxes_idx = bboxes[idx, :].unsqueeze(dim=0)
                # IoU between each candidate and the top-scoring box
                iou_sorted = calc_iou_tensor(bboxes_sorted, bboxes_idx).squeeze()

                # we only need iou < criteria
                # drop every candidate whose IoU with the top box exceeds criteria
                # (including the top box itself)
                score_idx_sorted = score_idx_sorted[iou_sorted < criteria]
                # keep the top box's index
                candidates.append(idx)

            # store this class's detections that survived NMS
            bboxes_out.append(bboxes[candidates, :])   # box coordinates
            scores_out.append(score[candidates])       # scores
            labels_out.extend([i] * len(candidates))   # labels

        if not bboxes_out:
            # If nothing survived, return empty tensors; note the shape of the
            # empty box tensor, which keeps the validation code from failing.
            return [torch.empty(size=(0, 4)), torch.empty(size=(0,), dtype=torch.int64), torch.empty(size=(0,))]

        bboxes_out = torch.cat(bboxes_out, dim=0).contiguous()
        scores_out = torch.cat(scores_out, dim=0).contiguous()
        labels_out = torch.as_tensor(labels_out, dtype=torch.long)

        # Sort all detections by score regardless of class and keep the top max_output.
        _, max_ids = scores_out.sort(dim=0)
        max_ids = max_ids[-max_output:]
        return bboxes_out[max_ids, :], labels_out[max_ids], scores_out[max_ids]

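A minimal sketch of the matching step in `Encoder.encode`, assuming the `dboxes300_coco()` factory defined further down; the GT boxes and labels are made up and normalized to [0, 1]:

import torch

dboxes = dboxes300_coco()            # 8732 default boxes (defined below)
encoder = Encoder(dboxes)

# two hypothetical ground-truth boxes in ltrb format, normalized to [0, 1]
gt_boxes = torch.tensor([[0.10, 0.10, 0.40, 0.50],
                         [0.55, 0.30, 0.90, 0.80]])
gt_labels = torch.tensor([3, 7])     # made-up class ids

bboxes_out, labels_out = encoder.encode(gt_boxes, gt_labels, criteria=0.5)
print(bboxes_out.shape, labels_out.shape)  # torch.Size([8732, 4]) torch.Size([8732])
print((labels_out > 0).sum())              # number of default boxes matched as positives
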
class DefaultBoxes(object):
    def __init__(self, fig_size, feat_size, steps, scales, aspect_ratios, scale_xy=0.1, scale_wh=0.2):
        self.fig_size = fig_size   # input image size fed to the network (300)
        # [38, 19, 10, 5, 3, 1]
        self.feat_size = feat_size  # feature map size of every prediction layer

        self.scale_xy_ = scale_xy
        self.scale_wh_ = scale_wh

        # According to https://github.com/weiliu89/caffe
        # Calculation method slightly different from paper
        # [8, 16, 32, 64, 100, 300]
        self.steps = steps  # stride of one cell of each feature layer, measured on the original image

        # [21, 45, 99, 153, 207, 261, 315]
        self.scales = scales  # default-box scales of each feature layer

        fk = fig_size / np.array(steps)  # compute f_k for every feature layer
        # [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
        self.aspect_ratios = aspect_ratios  # default-box aspect ratios of each prediction layer

        self.default_boxes = []
        # size of feature and number of feature
        # iterate over the feature layers and build their default boxes
        for idx, sfeat in enumerate(self.feat_size):
            sk1 = scales[idx] / fig_size       # scale as a relative value in [0, 1]
            sk2 = scales[idx + 1] / fig_size   # scale as a relative value in [0, 1]
            sk3 = sqrt(sk1 * sk2)
            # first add the widths and heights of the two 1:1 default boxes
            all_sizes = [(sk1, sk1), (sk3, sk3)]

            # then add the widths and heights of the remaining aspect ratios
            for alpha in aspect_ratios[idx]:
                w, h = sk1 * sqrt(alpha), sk1 / sqrt(alpha)
                all_sizes.append((w, h))
                all_sizes.append((h, w))

            # generate all default boxes of this feature layer on the original image
            for w, h in all_sizes:
                for i, j in itertools.product(range(sfeat), repeat=2):  # i -> row (y), j -> column (x)
                    # center of each default box, normalized to [0, 1]
                    cx, cy = (j + 0.5) / fk[idx], (i + 0.5) / fk[idx]
                    self.default_boxes.append((cx, cy, w, h))

        # convert default_boxes to a tensor
        self.dboxes = torch.as_tensor(self.default_boxes, dtype=torch.float32)  # dtype must be set explicitly or this raises
        self.dboxes.clamp_(min=0, max=1)  # clamp the coordinates (x, y, w, h) into [0, 1]

        # For IoU calculation
        # ltrb is left-top coordinate and right-bottom coordinate
        # Convert (x, y, w, h) to (xmin, ymin, xmax, ymax), which simplifies the
        # IoU computation when matching positive and negative samples.
        self.dboxes_ltrb = self.dboxes.clone()
        self.dboxes_ltrb[:, 0] = self.dboxes[:, 0] - 0.5 * self.dboxes[:, 2]  # xmin
        self.dboxes_ltrb[:, 1] = self.dboxes[:, 1] - 0.5 * self.dboxes[:, 3]  # ymin
        self.dboxes_ltrb[:, 2] = self.dboxes[:, 0] + 0.5 * self.dboxes[:, 2]  # xmax
        self.dboxes_ltrb[:, 3] = self.dboxes[:, 1] + 0.5 * self.dboxes[:, 3]  # ymax

    @property
    def scale_xy(self):
        return self.scale_xy_

    @property
    def scale_wh(self):
        return self.scale_wh_

    def __call__(self, order='ltrb'):
        # return the default boxes in the requested format
        if order == 'ltrb':
            return self.dboxes_ltrb

        if order == 'xywh':
            return self.dboxes

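Each location on layer idx gets 2 + 2 * len(aspect_ratios[idx]) default boxes: the two 1:1 boxes plus a (w, h) and (h, w) pair per ratio. A hedged sketch of that bookkeeping with the standard SSD300 settings used below:

feat_size = [38, 19, 10, 5, 3, 1]
aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

per_location = [2 + 2 * len(ars) for ars in aspect_ratios]   # [4, 6, 6, 6, 4, 4]
total = sum(f * f * n for f, n in zip(feat_size, per_location))
print(per_location, total)   # [4, 6, 6, 6, 4, 4] 8732
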
def dboxes300_coco():
    figsize = 300  # input image size fed to the network
    feat_size = [38, 19, 10, 5, 3, 1]   # feature map size of every prediction layer
    steps = [8, 16, 32, 64, 100, 300]   # stride of one cell of each feature layer on the original image
    # use the scales here: https://github.com/amdegroot/ssd.pytorch/blob/master/data/config.py
    scales = [21, 45, 99, 153, 207, 261, 315]  # default-box scales of each feature layer
    aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]  # default-box aspect ratios of each prediction layer
    dboxes = DefaultBoxes(figsize, feat_size, steps, scales, aspect_ratios)
    return dboxes

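A quick check of the factory (hedged; the shapes follow from the per-layer counts above):

dboxes = dboxes300_coco()
print(dboxes(order='ltrb').shape)        # torch.Size([8732, 4])
print(dboxes(order='xywh').shape)        # torch.Size([8732, 4])
print(dboxes.scale_xy, dboxes.scale_wh)  # 0.1 0.2
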
def nms(boxes, scores, iou_threshold):
    # type: (Tensor, Tensor, float) -> Tensor
    """
    Performs non-maximum suppression (NMS) on the boxes according
    to their intersection-over-union (IoU).

    NMS iteratively removes lower-scoring boxes which have an
    IoU greater than iou_threshold with another (higher-scoring)
    box.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes to perform NMS on. They
        are expected to be in (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    iou_threshold : float
        discards all overlapping
        boxes with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices
        of the elements that have been kept
        by NMS, sorted in decreasing order of scores
    """
    return torch.ops.torchvision.nms(boxes, scores, iou_threshold)


def batched_nms(boxes, scores, idxs, iou_threshold):
    # type: (Tensor, Tensor, Tensor, float) -> Tensor
    """
    Performs non-maximum suppression in a batched fashion.

    Each index value corresponds to a category, and NMS
    will not be applied between elements of different categories.

    Parameters
    ----------
    boxes : Tensor[N, 4]
        boxes where NMS will be performed. They
        are expected to be in (x1, y1, x2, y2) format
    scores : Tensor[N]
        scores for each one of the boxes
    idxs : Tensor[N]
        indices of the categories for each one of the boxes.
    iou_threshold : float
        discards all overlapping boxes
        with IoU > iou_threshold

    Returns
    -------
    keep : Tensor
        int64 tensor with the indices of
        the elements that have been kept by NMS, sorted
        in decreasing order of scores
    """
    if boxes.numel() == 0:
        return torch.empty((0,), dtype=torch.int64, device=boxes.device)

    # strategy: in order to perform NMS independently per class,
    # we add an offset to all the boxes. The offset is dependent
    # only on the class idx, and is large enough so that boxes
    # from different classes do not overlap
    # largest coordinate value over all boxes (xmin, ymin, xmax, ymax)
    max_coordinate = boxes.max()

    # to(): Performs Tensor dtype and/or device conversion
    # generate one large offset per class;
    # .to() just matches the offsets' dtype and device to boxes
    offsets = idxs.to(boxes) * (max_coordinate + 1)
    # after adding its class offset, a box can never overlap a box of another class
    boxes_for_nms = boxes + offsets[:, None]
    keep = nms(boxes_for_nms, scores, iou_threshold)
    return keep

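A hedged sketch of the class-offset trick (requires torchvision for the underlying nms op; the boxes and scores are invented): two heavily overlapping boxes survive together when they carry different class ids.

import torch
import torchvision  # registers the torchvision.nms op used above

boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0],
                      [1.0, 1.0, 11.0, 11.0],
                      [1.0, 1.0, 11.0, 11.0]])
scores = torch.tensor([0.9, 0.8, 0.7])
idxs = torch.tensor([0, 0, 1])   # third box belongs to another class

keep = batched_nms(boxes, scores, idxs, iou_threshold=0.5)
print(keep)  # tensor([0, 2]): box 1 is suppressed by box 0, box 2 survives in its own class
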
class PostProcess(nn.Module):
    def __init__(self, dboxes):
        super(PostProcess, self).__init__()
        # [num_anchors, 4] -> [1, num_anchors, 4]
        self.dboxes_xywh = nn.Parameter(dboxes(order='xywh').unsqueeze(dim=0),
                                        requires_grad=False)
        self.scale_xy = dboxes.scale_xy  # 0.1
        self.scale_wh = dboxes.scale_wh  # 0.2

        self.criteria = 0.5
        self.max_output = 100

    def scale_back_batch(self, bboxes_in, scores_in):
        # type: (Tensor, Tensor) -> Tuple[Tensor, Tensor]
        """
        1) apply the predicted regression parameters to get the final box coordinates
        2) convert the boxes from xywh back to ltrb
        3) run the predicted scores through softmax
        Do scale and transform from xywh to ltrb
        suppose input N x 4 x num_bbox | N x label_num x num_bbox

        bboxes_in: [N, 4, 8732], the xywh regression parameters predicted by the network
        scores_in: [N, label_num, 8732], the per-class probabilities predicted for each default box
        """

        # Returns a view of the original tensor with its dimensions permuted.
        # [batch, 4, 8732] -> [batch, 8732, 4]
        bboxes_in = bboxes_in.permute(0, 2, 1)
        # [batch, label_num, 8732] -> [batch, 8732, label_num]
        scores_in = scores_in.permute(0, 2, 1)
        # print(bboxes_in.is_contiguous())

        bboxes_in[:, :, :2] = self.scale_xy * bboxes_in[:, :, :2]  # predicted x, y regression parameters
        bboxes_in[:, :, 2:] = self.scale_wh * bboxes_in[:, :, 2:]  # predicted w, h regression parameters

        # apply the regression parameters to the default boxes to get the final predicted boxes
        bboxes_in[:, :, :2] = bboxes_in[:, :, :2] * self.dboxes_xywh[:, :, 2:] + self.dboxes_xywh[:, :, :2]
        bboxes_in[:, :, 2:] = bboxes_in[:, :, 2:].exp() * self.dboxes_xywh[:, :, 2:]

        # transform format to ltrb
        l = bboxes_in[:, :, 0] - 0.5 * bboxes_in[:, :, 2]
        t = bboxes_in[:, :, 1] - 0.5 * bboxes_in[:, :, 3]
        r = bboxes_in[:, :, 0] + 0.5 * bboxes_in[:, :, 2]
        b = bboxes_in[:, :, 1] + 0.5 * bboxes_in[:, :, 3]

        bboxes_in[:, :, 0] = l  # xmin
        bboxes_in[:, :, 1] = t  # ymin
        bboxes_in[:, :, 2] = r  # xmax
        bboxes_in[:, :, 3] = b  # ymax

        # scores_in: [batch, 8732, label_num]
        return bboxes_in, F.softmax(scores_in, dim=-1)

    def decode_single_new(self, bboxes_in, scores_in, criteria, num_output):
        # type: (Tensor, Tensor, float, int) -> Tuple[Tensor, Tensor, Tensor]
        """
        decode:
            input  : bboxes_in (Tensor 8732 x 4), scores_in (Tensor 8732 x nitems)
            output : bboxes_out (Tensor nboxes x 4), labels_out (Tensor nboxes)
            criteria : IoU threshold of bboxes
            max_output : maximum number of output bboxes
        """
        device = bboxes_in.device
        num_classes = scores_in.shape[-1]

        # clip boxes that fall outside the image
        bboxes_in = bboxes_in.clamp(min=0, max=1)

        # [8732, 4] -> [8732, 21, 4]
        bboxes_in = bboxes_in.repeat(1, num_classes).reshape(scores_in.shape[0], -1, 4)

        # create labels for each prediction
        labels = torch.arange(num_classes, device=device)
        # [num_classes] -> [8732, num_classes]
        labels = labels.view(1, -1).expand_as(scores_in)

        # remove predictions with the background label
        bboxes_in = bboxes_in[:, 1:, :]  # [8732, 21, 4] -> [8732, 20, 4]
        scores_in = scores_in[:, 1:]     # [8732, 21] -> [8732, 20]
        labels = labels[:, 1:]           # [8732, 21] -> [8732, 20]

        # batch everything, by making every class prediction be a separate instance
        bboxes_in = bboxes_in.reshape(-1, 4)  # [8732, 20, 4] -> [8732x20, 4]
        scores_in = scores_in.reshape(-1)     # [8732, 20] -> [8732x20]
        labels = labels.reshape(-1)           # [8732, 20] -> [8732x20]

        # remove low-scoring boxes (score threshold 0.05)
        # inds = torch.nonzero(scores_in > 0.05).squeeze(1)
        inds = torch.where(torch.gt(scores_in, 0.05))[0]
        bboxes_in, scores_in, labels = bboxes_in[inds, :], scores_in[inds], labels[inds]

        # remove empty boxes
        ws, hs = bboxes_in[:, 2] - bboxes_in[:, 0], bboxes_in[:, 3] - bboxes_in[:, 1]
        keep = (ws >= 1 / 300) & (hs >= 1 / 300)
        # keep = keep.nonzero().squeeze(1)
        keep = torch.where(keep)[0]
        bboxes_in, scores_in, labels = bboxes_in[keep], scores_in[keep], labels[keep]

        # non-maximum suppression
        keep = batched_nms(bboxes_in, scores_in, labels, iou_threshold=criteria)

        # keep only top-k scoring predictions
        keep = keep[:num_output]
        bboxes_out = bboxes_in[keep, :]
        scores_out = scores_in[keep]
        labels_out = labels[keep]

        return bboxes_out, labels_out, scores_out

    def forward(self, bboxes_in, scores_in):
        # apply the predicted regression parameters to get the final boxes
        # and softmax the predicted scores
        bboxes, probs = self.scale_back_batch(bboxes_in, scores_in)

        outputs = torch.jit.annotate(List[Tuple[Tensor, Tensor, Tensor]], [])
        # iterate over every image in the batch
        # bboxes: [batch, 8732, 4]
        for bbox, prob in zip(bboxes.split(1, 0), probs.split(1, 0)):  # split_size, split_dim
            # bbox: [1, 8732, 4]
            bbox = bbox.squeeze(0)
            prob = prob.squeeze(0)
            outputs.append(self.decode_single_new(bbox, prob, self.criteria, self.max_output))
        return outputs
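A hedged smoke test of PostProcess on random head outputs (21 classes, i.e. 20 foreground plus background; the tensors are random, so the resulting detections are meaningless):

import torch
import torchvision  # registers the nms op used by batched_nms

post = PostProcess(dboxes300_coco())
loc = torch.randn(2, 4, 8732)     # fake localization head output, [batch, 4, 8732]
conf = torch.randn(2, 21, 8732)   # fake confidence head output, [batch, 21, 8732]

with torch.no_grad():
    detections = post(loc, conf)

boxes, labels, scores = detections[0]
print(boxes.shape, labels.shape, scores.shape)  # up to 100 detections for the first image
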
sddfrcnn_model/draw_box_utils.py
ADDED
|
@@ -0,0 +1,197 @@
|
from PIL.Image import Image, fromarray
import PIL.ImageDraw as ImageDraw
import PIL.ImageFont as ImageFont
from PIL import ImageColor
import numpy as np

STANDARD_COLORS = [
    'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
    'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
    'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
    'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
    'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
    'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
    'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
    'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
    'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
    'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
    'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
    'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
    'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
    'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
    'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
    'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
    'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
    'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
    'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
    'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
    'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
    'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
    'WhiteSmoke', 'Yellow', 'YellowGreen'
]


def draw_text(draw,
              box: list,
              cls: int,
              score: float,
              category_index: dict,
              color: str,
              font: str = 'arial.ttf',
              font_size: int = 24):
    """
    Draw the bounding box's class label and score onto the image.
    """
    try:
        font = ImageFont.truetype(font, font_size)
    except IOError:
        font = ImageFont.load_default()

    left, top, right, bottom = box
    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str = f"{category_index[str(cls)]}: {int(100 * score)}%"
    """
    display_str_heights = [draw.textsize(ds, font=font)[1] for ds in display_str]
    """
    bbox = draw.textbbox((0, 0), display_str, font=font)
    text_width = bbox[2] - bbox[0]
    text_height = bbox[3] - bbox[1]
    # Each display_str has a top and bottom margin of 0.05x.
    display_str_height = (1 + 2 * 0.05) * text_height

    if top > display_str_height:
        text_top = top - display_str_height
        text_bottom = top
    else:
        text_top = bottom
        text_bottom = bottom + display_str_height
    margin = np.ceil(0.05 * text_width)
    text_rect_left = left
    text_rect_right = left + text_width + 2 * margin

    # make sure the text rectangle does not leave the image
    img_width, img_height = draw.im.size
    if text_rect_right > img_width:
        text_rect_right = img_width
        text_rect_left = max(0, img_width - text_width - 2 * margin)

    if text_bottom > img_height:
        text_bottom = img_height
        text_top = max(0, img_height - display_str_height)

    # draw the text background and the text itself
    draw.rectangle([(text_rect_left, text_top),
                    (text_rect_right, text_bottom)], fill=color)
    draw.text((text_rect_left + margin, text_top),
              display_str,
              fill='black',
              font=font)
    '''
    for ds in display_str:
        """
        text_width, text_height = draw.textsize(text, font=font)
        """
        bbox = draw.textbbox((0, 0), display_str, font=font)
        text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
        margin = np.ceil(0.05 * text_width)
        draw.rectangle([(left, text_top),
                        (left + text_width + 2 * margin, text_bottom)], fill=color)
        draw.text((left + margin, text_top),
                  ds,
                  fill='black',
                  font=font)
        left += text_width
    '''


def draw_masks(image, masks, colors, thresh: float = 0.7, alpha: float = 0.5):
    np_image = np.array(image)
    masks = np.where(masks > thresh, True, False)

    # colors = np.array(colors)
    img_to_draw = np.copy(np_image)
    # TODO: There might be a way to vectorize this
    for mask, color in zip(masks, colors):
        img_to_draw[mask] = color

    out = np_image * (1 - alpha) + img_to_draw * alpha
    return fromarray(out.astype(np.uint8))


def draw_objs(image: Image,
              boxes: np.ndarray = None,
              classes: np.ndarray = None,
              scores: np.ndarray = None,
              masks: np.ndarray = None,
              category_index: dict = None,
              box_thresh: float = 0.1,
              mask_thresh: float = 0.5,
              line_thickness: int = 8,
              font: str = 'arial.ttf',
              font_size: int = 24,
              draw_boxes_on_image: bool = True,
              draw_masks_on_image: bool = False):
    """
    Draw bounding boxes, class information and masks onto an image.
    Args:
        image: the image to draw on
        boxes: bounding-box coordinates
        classes: class indices
        scores: detection scores
        masks: segmentation masks
        category_index: dict mapping class index to class name
        box_thresh: score threshold for filtering detections
        mask_thresh: mask binarization threshold
        line_thickness: bounding-box line width
        font: font file
        font_size: font size
        draw_boxes_on_image: whether to draw the boxes
        draw_masks_on_image: whether to draw the masks

    Returns:
        the image with the drawings applied
    """

    # filter out low-probability detections
    idxs = np.greater(scores, box_thresh)
    boxes = boxes[idxs]
    classes = classes[idxs]
    scores = scores[idxs]
    if masks is not None:
        masks = masks[idxs]
    if len(boxes) == 0:
        return image

    colors = [ImageColor.getrgb(STANDARD_COLORS[cls % len(STANDARD_COLORS)]) for cls in classes]

    if draw_boxes_on_image:
        # Draw all boxes onto image.
        draw = ImageDraw.Draw(image)
        for box, cls, score, color in zip(boxes, classes, scores, colors):
            left, top, right, bottom = box
            # clip the box to the image bounds before drawing

            img_width, img_height = image.size
            left = max(0, min(left, img_width - 1))
            top = max(0, min(top, img_height - 1))
            right = max(0, min(right, img_width - 1))
            bottom = max(0, min(bottom, img_height - 1))

            # draw the bounding box as four segments (not five)
            draw.line([(left, top), (right, top)], width=line_thickness, fill=color)        # top edge
            draw.line([(right, top), (right, bottom)], width=line_thickness, fill=color)    # right edge
            draw.line([(right, bottom), (left, bottom)], width=line_thickness, fill=color)  # bottom edge
            draw.line([(left, bottom), (left, top)], width=line_thickness, fill=color)      # left edge
            '''
            draw.line([(left, top), (left, bottom), (right, bottom),
                       (right, top), (left, top)], width=line_thickness, fill=color)
            '''
            # draw class and score
            draw_text(draw, box.tolist(), int(cls), float(score), category_index, color, font, font_size)

    if draw_masks_on_image and (masks is not None):
        # Draw all masks onto image.
        image = draw_masks(image, masks, colors, mask_thresh)

    return image
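A hedged usage sketch of draw_objs (the image, detections, and category_index mapping are all invented; note the string keys, since draw_text looks up category_index[str(cls)]):

import numpy as np
from PIL import Image

img = Image.new('RGB', (300, 300), color='white')   # placeholder image
boxes = np.array([[30.0, 40.0, 180.0, 220.0]])      # (left, top, right, bottom)
classes = np.array([1])
scores = np.array([0.87])
category_index = {'1': 'target'}                    # hypothetical class name

out = draw_objs(img, boxes, classes, scores,
                category_index=category_index,
                box_thresh=0.5, line_thickness=4)
out.save('vis.jpg')
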
sddfrcnn_model/network_files/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
from .retinanet import RetinaNet
sddfrcnn_model/network_files/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (212 Bytes)

sddfrcnn_model/network_files/__pycache__/anchor_utils.cpython-310.pyc
ADDED
Binary file (5.19 kB)

sddfrcnn_model/network_files/__pycache__/boxes.cpython-310.pyc
ADDED
Binary file (5.12 kB)

sddfrcnn_model/network_files/__pycache__/det_utils.cpython-310.pyc
ADDED
Binary file (10.6 kB)

sddfrcnn_model/network_files/__pycache__/image_list.cpython-310.pyc
ADDED
Binary file (1.15 kB)

sddfrcnn_model/network_files/__pycache__/losses.cpython-310.pyc
ADDED
Binary file (1.91 kB)

sddfrcnn_model/network_files/__pycache__/retinanet.cpython-310.pyc
ADDED
Binary file (16.2 kB)

sddfrcnn_model/network_files/__pycache__/transform.cpython-310.pyc
ADDED
Binary file (8.89 kB)
sddfrcnn_model/network_files/anchor_utils.py
ADDED
|
@@ -0,0 +1,192 @@
|
from typing import List, Optional, Dict

import torch
from torch import nn, Tensor

from .image_list import ImageList


class AnchorsGenerator(nn.Module):
    __annotations__ = {
        "cell_anchors": Optional[List[torch.Tensor]],
        "_cache": Dict[str, List[torch.Tensor]]
    }

    """
    Anchor generator.
    Module that generates anchors for a set of feature maps and
    image sizes.

    The module supports computing anchors at multiple sizes and aspect ratios
    per feature map.

    sizes and aspect_ratios should have the same number of elements, and it should
    correspond to the number of feature maps.

    sizes[i] and aspect_ratios[i] can have an arbitrary number of elements,
    and AnchorGenerator will output a set of sizes[i] * aspect_ratios[i] anchors
    per spatial location for feature map i.

    Arguments:
        sizes (Tuple[Tuple[int]]):
        aspect_ratios (Tuple[Tuple[float]]):
    """

    def __init__(self, sizes=(128, 256, 512), aspect_ratios=(0.5, 1.0, 2.0)):
        super(AnchorsGenerator, self).__init__()

        if not isinstance(sizes[0], (list, tuple)):
            # TODO change this
            sizes = tuple((s,) for s in sizes)
        if not isinstance(aspect_ratios[0], (list, tuple)):
            aspect_ratios = (aspect_ratios,) * len(sizes)

        assert len(sizes) == len(aspect_ratios)

        self.sizes = sizes
        self.aspect_ratios = aspect_ratios
        self.cell_anchors = None
        self._cache = {}

    def generate_anchors(self, scales, aspect_ratios, dtype=torch.float32, device=torch.device("cpu")):
        # type: (List[int], List[float], torch.dtype, torch.device) -> Tensor
        """
        compute anchor sizes
        Arguments:
            scales: sqrt(anchor_area)
            aspect_ratios: h/w ratios
            dtype: float32
            device: cpu/gpu
        """
        scales = torch.as_tensor(scales, dtype=dtype, device=device)
        aspect_ratios = torch.as_tensor(aspect_ratios, dtype=dtype, device=device)
        h_ratios = torch.sqrt(aspect_ratios)
        w_ratios = 1.0 / h_ratios

        # [r1, r2, r3]' * [s1, s2, s3]
        # number of elements is len(ratios)*len(scales)
        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
        hs = (h_ratios[:, None] * scales[None, :]).view(-1)

        # left-top, right-bottom coordinates relative to the anchor center (0, 0)
        # the generated anchor templates are all centered at (0, 0),
        # shape [len(ratios)*len(scales), 4]
        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2

        return base_anchors.round()  # round to the nearest integer

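A hedged check of generate_anchors on its own (the scale and ratios are made up; this assumes the module imports cleanly, since the relative image_list import runs at import time):

import torch

gen = AnchorsGenerator(sizes=((128,),), aspect_ratios=((0.5, 1.0, 2.0),))
templates = gen.generate_anchors([128], [0.5, 1.0, 2.0])
print(templates.shape)  # torch.Size([3, 4]), one zero-centered template per ratio
print(templates[1])     # 1:1 ratio -> tensor([-64., -64., 64., 64.])
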
    def set_cell_anchors(self, dtype, device):
        # type: (torch.dtype, torch.device) -> None
        if self.cell_anchors is not None:
            cell_anchors = self.cell_anchors
            assert cell_anchors is not None
            # suppose that all anchors have the same device
            # which is a valid assumption in the current state of the codebase
            if cell_anchors[0].device == device:
                return

        # generate the anchor templates from the given sizes and aspect_ratios;
        # all templates are centered at (0, 0)
        cell_anchors = [
            self.generate_anchors(sizes, aspect_ratios, dtype, device)
            for sizes, aspect_ratios in zip(self.sizes, self.aspect_ratios)
        ]
        self.cell_anchors = cell_anchors

    def num_anchors_per_location(self):
        # number of anchors predicted at each sliding-window position of every prediction feature layer
        return [len(s) * len(a) for s, a in zip(self.sizes, self.aspect_ratios)]

    # For every combination of (a, (g, s), i) in (self.cell_anchors, zip(grid_sizes, strides), 0:2),
    # output g[i] anchors that are s[i] distance apart in direction i, with the same dimensions as a.
    def grid_anchors(self, grid_sizes, strides):
        # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
        """
        anchors position in grid coordinate axis map into origin image
        Compute the coordinates of all anchors on the original image for each prediction feature map.
        Args:
            grid_sizes: height and width of the prediction feature maps
            strides: stride of one step on a feature map, measured on the original image
        """
        anchors = []
        cell_anchors = self.cell_anchors
        assert cell_anchors is not None

        # iterate over the grid_size, stride and cell_anchors of every prediction feature layer
        for size, stride, base_anchors in zip(grid_sizes, strides, cell_anchors):
            grid_height, grid_width = size
            stride_height, stride_width = stride
            device = base_anchors.device

            # For output anchor, compute [x_center, y_center, x_center, y_center]
            # shape: [grid_width], the x coordinates (columns) on the original image
            shifts_x = torch.arange(0, grid_width, dtype=torch.float32, device=device) * stride_width
            # shape: [grid_height], the y coordinates (rows) on the original image
            shifts_y = torch.arange(0, grid_height, dtype=torch.float32, device=device) * stride_height

            # Compute, for every point of the feature map, the corresponding coordinates
            # on the original image (the offsets to apply to the anchor templates).
            # torch.meshgrid takes the row and column coordinates and produces the
            # grid of row coordinates and the grid of column coordinates.
            # shape: [grid_height, grid_width]
            # (newer PyTorch versions may require passing indexing="ij" here)
            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)
            shift_x = shift_x.reshape(-1)
            shift_y = shift_y.reshape(-1)

            # offsets of the anchor coordinates (xmin, ymin, xmax, ymax) on the original image
            # shape: [grid_width*grid_height, 4]
            shifts = torch.stack([shift_x, shift_y, shift_x, shift_y], dim=1)

            # For every (base anchor, output anchor) pair,
            # offset each zero-centered base anchor by the center of the output anchor.
            # Adding the offsets to the templates yields the coordinates of all anchors
            # on the original image (broadcasting handles the shape mismatch).
            shifts_anchor = shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)
            anchors.append(shifts_anchor.reshape(-1, 4))

        return anchors  # List[Tensor(all_num_anchors, 4)]

    def cached_grid_anchors(self, grid_sizes, strides):
        # type: (List[List[int]], List[List[Tensor]]) -> List[Tensor]
        """Cache all computed anchors."""
        key = str(grid_sizes) + str(strides)
        # self._cache is a dict
        if key in self._cache:
            return self._cache[key]
        anchors = self.grid_anchors(grid_sizes, strides)
        self._cache[key] = anchors
        return anchors

    def forward(self, image_list, feature_maps):
        # type: (ImageList, List[Tensor]) -> List[Tensor]
        # height and width of every prediction feature layer
        grid_sizes = list([feature_map.shape[-2:] for feature_map in feature_maps])

        # height and width of the input images
        image_size = image_list.tensors.shape[-2:]

        # dtype and device of the feature maps
        dtype, device = feature_maps[0].dtype, feature_maps[0].device

        # one step in feature map equates to an n-pixel stride on the original image
        strides = [[torch.tensor(image_size[0] // g[0], dtype=torch.int64, device=device),
                    torch.tensor(image_size[1] // g[1], dtype=torch.int64, device=device)] for g in grid_sizes]

        # generate the anchor templates from the given sizes and aspect_ratios
        self.set_cell_anchors(dtype, device)

        # Compute (or read from the cache) the coordinates of all anchors; these are the
        # anchors mapped back onto the original image, not the templates.
        # The result is a list with one entry of anchor coordinates per prediction feature map.
        anchors_over_all_feature_maps = self.cached_grid_anchors(grid_sizes, strides)

        anchors = torch.jit.annotate(List[List[torch.Tensor]], [])
        # iterate over every image in the batch
        for i, (image_height, image_width) in enumerate(image_list.image_sizes):
            anchors_in_image = []
            # iterate over the per-feature-map anchors mapped back onto the original image
            for anchors_per_feature_map in anchors_over_all_feature_maps:
                anchors_in_image.append(anchors_per_feature_map)
            anchors.append(anchors_in_image)
        # Concatenate the anchors of all prediction feature layers for each image;
        # anchors is a list with one tensor holding all anchors per image.
        anchors = [torch.cat(anchors_per_image) for anchors_per_image in anchors]
        # Clear the cache in case that memory leaks.
        self._cache.clear()
        return anchors
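A hedged end-to-end sketch of the generator (the FakeImageList stand-in is hypothetical and only mimics the two attributes forward actually reads; real code would pass the module's ImageList):

import torch
from collections import namedtuple

# stand-in exposing only the attributes forward() reads
FakeImageList = namedtuple("FakeImageList", ["tensors", "image_sizes"])

gen = AnchorsGenerator(sizes=((32,), (64,)), aspect_ratios=((0.5, 1.0, 2.0),) * 2)
feature_maps = [torch.randn(1, 256, 50, 50), torch.randn(1, 256, 25, 25)]
images = FakeImageList(tensors=torch.randn(1, 3, 400, 400), image_sizes=[(400, 400)])

anchors = gen(images, feature_maps)
print(len(anchors), anchors[0].shape)  # 1 image, (50*50 + 25*25) * 3 anchors = torch.Size([9375, 4])
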