From 6eb696918cf6a3cf2b29bddfd8c9cdb76529b4cc Mon Sep 17 00:00:00 2001 From: Donny You Date: Mon, 29 Jul 2019 23:59:21 +0800 Subject: [PATCH] fix ssd training. --- model/det/loss/multibox_loss.py | 2 +- model/det/nets/vgg16_ssd300.py | 1 + model/det/nets/vgg16_ssd512.py | 1 + runner/det/single_shot_detector.py | 13 ++++++++----- scripts/det/voc/run_ssd300_vgg16_voc_det.sh | 2 +- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/model/det/loss/multibox_loss.py b/model/det/loss/multibox_loss.py index ff2835f..0032789 100644 --- a/model/det/loss/multibox_loss.py +++ b/model/det/loss/multibox_loss.py @@ -68,7 +68,7 @@ def smooth_l1_loss(x, t): y = flag * (diff ** 2) * 0.5 + (1 - flag) * (abs_diff - 0.5) return y.sum() - def forward(self, loc_targets, conf_targets, loc_preds, conf_preds): + def forward(self, loc_preds, conf_preds, loc_targets, conf_targets): """Compute loss between (loc_preds, loc_targets) and (conf_preds, conf_targets). Args: diff --git a/model/det/nets/vgg16_ssd300.py b/model/det/nets/vgg16_ssd300.py index ccdb5f0..b49edb6 100755 --- a/model/det/nets/vgg16_ssd300.py +++ b/model/det/nets/vgg16_ssd300.py @@ -84,6 +84,7 @@ def vgg_backbone(configer): class Vgg16SSD300(nn.Module): def __init__(self, configer): super(Vgg16SSD300, self).__init__() + self.configer = configer self.backbone = vgg_backbone(configer).named_modules() cnt = 0 self.sub_backbone_1 = nn.ModuleList() diff --git a/model/det/nets/vgg16_ssd512.py b/model/det/nets/vgg16_ssd512.py index e6c4609..a8705fc 100644 --- a/model/det/nets/vgg16_ssd512.py +++ b/model/det/nets/vgg16_ssd512.py @@ -84,6 +84,7 @@ def vgg_backbone(configer): class Vgg16SSD512(nn.Module): def __init__(self, configer): super(Vgg16SSD512, self).__init__() + self.configer = configer self.backbone = vgg_backbone(configer).named_modules() cnt = 0 self.sub_backbone_1 = nn.ModuleList() diff --git a/runner/det/single_shot_detector.py b/runner/det/single_shot_detector.py index a72ed46..437ab35 100755 --- a/runner/det/single_shot_detector.py +++ b/runner/det/single_shot_detector.py @@ -44,6 +44,7 @@ def __init__(self, configer): self._init_model() def _init_model(self): + # torch.multiprocessing.set_sharing_strategy('file_system') self.det_net = self.det_model_manager.object_detector() self.det_net = RunnerHelper.load_net(self, self.det_net) self.optimizer, self.scheduler = Trainer.init(self._get_parameters(), self.configer.get('solver')) @@ -82,8 +83,9 @@ def train(self): solver_dict=self.configer.get('solver')) self.data_time.update(time.time() - start_time) # Forward pass. - out_dict = self.det_net(data_dict) - loss = out_dict['loss'].mean() + out = self.det_net(data_dict) + loss_dict = self.det_loss(out) + loss = loss_dict['loss'] self.train_losses.update(loss.item(), len(DCHelper.tolist(data_dict['meta']))) self.optimizer.zero_grad() @@ -127,9 +129,10 @@ def val(self): with torch.no_grad(): for j, data_dict in enumerate(self.val_loader): # Forward pass. - out_dict = self.det_net(data_dict) - - loss = out_dict['loss'].mean() + out = self.det_net(data_dict) + loss_dict = self.det_loss(out) + loss = loss_dict['loss'] + out_dict, _ = RunnerHelper.gather(self, out) # Compute the loss of the val batch. self.val_losses.update(loss.item(), len(DCHelper.tolist(data_dict['meta']))) diff --git a/scripts/det/voc/run_ssd300_vgg16_voc_det.sh b/scripts/det/voc/run_ssd300_vgg16_voc_det.sh index f373fbf..e50ea78 100644 --- a/scripts/det/voc/run_ssd300_vgg16_voc_det.sh +++ b/scripts/det/voc/run_ssd300_vgg16_voc_det.sh @@ -24,7 +24,7 @@ if [[ ! -d ${LOG_DIR} ]]; then fi if [[ "$1"x == "train"x ]]; then - ${PYTHON} -u main.py --config_file ${CONFIG_FILE} --phase train --log_to_file n --gpu 0 \ + ${PYTHON} -u main.py --config_file ${CONFIG_FILE} --phase train --gpu 0 --workers 3 \ --data_dir ${DATA_DIR} --loss_type ${LOSS_TYPE} --model_name ${MODEL_NAME} \ --checkpoints_name ${CHECKPOINTS_NAME} --pretrained ${PRETRAINED_MODEL} 2>&1 | tee ${LOG_FILE}