strange detection result using the pretrained detection model on kitti-car #30

hedes1992 · 2017-08-15T09:52:11Z

I have just downloaded the released pretrained detection model and tested it on the training set of the KITTI 2D object detection benchmark.
At first, I only modified the KITTI dataset directory in rrc_test.py, but I got a bad result. The PR curve is:

When I draw the detection results on the image, I see the following:

As you can see, the results are mirrored horizontally (left-right flipped) relative to the objects.
I suspected that the coordinate post-processing was wrong, so I modified it to the following code:
`#!/usr/bin/env python
import numpy as np
import os, sys
import pickle
import timeit, Image, ImageDraw
from google.protobuf import text_format
# Root of the caffe checkout: three directory levels above this script.
caffe_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
print('caffe_root is {0}'.format(caffe_root))

# Confidence threshold used as the default by the parsing/formatting helpers
# and embedded in the name of the result directory.
CONFIDENCE_THRES = 0.2

kitti_detection_dataset_dir = '/home/hzw/projects/dataset/kitti/detection/training/image_2/'
model_store_dir = os.path.join(caffe_root, 'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/')
model_def_path = os.path.join(model_store_dir, 'deploy.prototxt')
model_weights_path = os.path.join(model_store_dir, 'VGG_KITTI_RRC_2560x768_kitti_4r4b_max_size_iter_60000.caffemodel')
voc_labelmap_file_path = os.path.join(caffe_root, 'data/KITTI-car/labelmap_voc.prototxt')
# Pickled network outputs go to save_dir; per-image result text files go to a
# threshold-specific sub-directory of it.
save_dir = os.path.join(model_store_dir, 'result-test-kitti_detection_training/')
txt_dir = os.path.join(save_dir, 'conf-{0}/'.format(CONFIDENCE_THRES))

# make sure the working directory is caffe_root
os.chdir(caffe_root)

# add the caffe python module (relative path, hence the chdir above)
sys.path.insert(0, 'python')
import caffe
from caffe.proto import caffe_pb2
from _ensemble import *

def get_labelname(labelmap, labels):
    """Map numeric class labels to their display names.

    Args:
        labelmap: object whose ``item`` sequence holds entries exposing
            ``label`` and ``display_name`` attributes.
        labels: a single label or a list of labels.

    Returns:
        List of display names, one per requested label, in input order.

    Raises:
        AssertionError: if a label has no matching entry in ``labelmap``.
    """
    # isinstance (not an exact type comparison) so that list subclasses are
    # treated as a sequence of labels rather than wrapped as a single label.
    if not isinstance(labels, list):
        labels = [labels]

    labelnames = []
    for label in labels:
        # Iterate the entries directly; this avoids index arithmetic and the
        # Python-2-only ``xrange``.
        for entry in labelmap.item:
            if label == entry.label:
                labelnames.append(entry.display_name)
                break
        else:
            # for/else: only reached when no entry matched. Raised explicitly
            # so the check is not stripped when running with -O.
            raise AssertionError("label {0!r} not found in labelmap".format(label))
    return labelnames

def get_net_out(image, net, transformer):
    """Run one forward pass of ``net`` on ``image`` and return its output.

    Args:
        image: input image, handed to ``transformer.preprocess``.
        net: network object exposing ``blobs['data'].data`` and ``forward()``.
        transformer: preprocessor exposing ``preprocess(blob_name, image)``.

    Returns:
        Whatever ``net.forward()`` returns.
    """
    # Preprocess and write in place into the network's 'data' input blob.
    net.blobs['data'].data[...] = transformer.preprocess('data', image)
    return net.forward()

def parse_net_out(net_out_dict, voc_labelmap, img_size, detection_num=3, conf_thres=CONFIDENCE_THRES):
    """Gather thresholded detections from every stored forward pass and ensemble them.

    Args:
        net_out_dict: mapping from a pass name to that pass's output dict; each
            output dict is indexed with 'detection_out2' up to
            'detection_out<detection_num>'.
        voc_labelmap: accepted for interface compatibility; not used here.
        img_size: sequence whose first two entries are (height, width); box
            coordinates are multiplied by these (presumably the network emits
            normalized coordinates -- confirm against the deploy prototxt).
        detection_num: index of the last 'detection_out' blob to read
            (reading starts at 2).
        conf_thres: detections with confidence >= this value are kept.

    Returns:
        The result of ``det_ensemble`` applied to the stacked rows
        ``[xmin, ymin, xmax, ymax, conf, label]`` and the number of
        detection blobs that were read.
    """
    height, width = img_size[0:2]
    blob_count = 0
    stacked = np.zeros([0, 6], float)

    for pass_out in net_out_dict.values():
        for head in range(2, detection_num + 1):
            rows = pass_out['detection_out%d' % (head)].copy()[0, 0]
            labels = rows[:, 1]
            scores = rows[:, 2]
            # NOTE: the x coordinates are mirrored (1 - x, min/max swapped) for
            # EVERY pass, including the un-flipped 'orin' one. The author found
            # this empirically and could not explain it; keep until understood.
            left = 1 - rows[:, 5]
            top = rows[:, 4]
            right = 1 - rows[:, 3]
            bottom = rows[:, 6]

            # indices of detections at or above the confidence threshold
            keep = np.flatnonzero(scores >= conf_thres)
            if keep.size == 0:
                picked = np.zeros([0, 6], float)
            else:
                picked = np.column_stack((
                    left[keep] * width,
                    top[keep] * height,
                    right[keep] * width,
                    bottom[keep] * height,
                    scores[keep],
                    labels[keep],
                ))

            stacked = np.concatenate((stacked, picked), axis=0)
            blob_count = blob_count + 1

    return det_ensemble(stacked, blob_count)

def transform_det_results_to_linelist(det_results, voc_labelmap, conf_thres=CONFIDENCE_THRES):
    """Format ensembled detections as KITTI-style result lines.

    Args:
        det_results: 2-D array with at least one row; columns 0-5 are read as
            xmin, ymin, xmax, ymax, confidence, label.
        voc_labelmap: label map handed to ``get_labelname``.
        conf_thres: only rows whose confidence is strictly greater are kept.

    Returns:
        List of result strings, one per kept detection.

    Raises:
        AssertionError: if ``det_results`` has no rows.
    """
    assert det_results.shape[0] > 0, "det_results.shape is: " + str(det_results.shape)

    kept = det_results[np.where(det_results[:, 4] > conf_thres)[0]]
    if kept.shape[0] == 0:
        # Nothing passed the threshold; skip the label lookup entirely.
        return []

    names = get_labelname(voc_labelmap, kept[:, 5].tolist())
    result_line_list = []
    for label, row in zip(names, kept):
        xmin, ymin, xmax, ymax, score = row[0], row[1], row[2], row[3], row[4]
        h = float(ymax - ymin)
        w = float(xmax - xmin)
        # drop degenerate boxes (this also protects the h / w division below)
        if (w == 0) or (h == 0):
            continue
        # drop tall, narrow boxes touching the left/right border; 10 and 1230
        # are hard-coded pixel limits (presumably for ~1242 px wide KITTI
        # frames -- confirm before using on other resolutions)
        if (h / w >= 2) and ((xmin < 10) or (xmax > 1230)):
            continue
        # All four box coordinates use %.3f; the earlier "%3.f" for xmax/ymax
        # rounded them to whole pixels.
        result_line = "%s -1 -1 -10 %.3f %.3f %.3f %.3f -1 -1 -1 -1000 -1000 -1000 -10 %.8f" % (
            label, xmin, ymin, xmax, ymax, score)
        result_line_list.append(result_line)
    return result_line_list

if __name__ == "__main__":
    # Driver with two independently switchable stages:
    #   STORE_OUTPUT   - run the network on every PNG (original and
    #                    horizontally flipped) and pickle the raw outputs
    #                    into save_dir;
    #   PROCESS_OUTPUT - turn the (fresh or pickled) outputs into one result
    #                    text file per image under txt_dir.
    caffe.set_device(0)
    caffe.set_mode_gpu()

    STORE_OUTPUT        = False
    PROCESS_OUTPUT      = True

    # prepare the output directories
    for out_dir in (txt_dir, save_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # read the labelmap configuration
    with open(voc_labelmap_file_path, 'r') as fid:
        voc_labelmap = caffe_pb2.LabelMap()
        text_format.Merge(str(fid.read()), voc_labelmap)

    if STORE_OUTPUT:
        # construct the net
        net = caffe.Net(model_def_path, model_weights_path, caffe.TEST)
        # input preprocessing: 'data' is the name of the input blob == net.inputs[0]
        transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
        transformer.set_transpose('data', (2, 0, 1))  # HWC -> CHW
        transformer.set_raw_scale('data', 255)  # the reference model operates on images in [0,255] range instead of [0,1]
        transformer.set_mean('data', np.array([104, 117, 123]))  # mean pixel value
        transformer.set_channel_swap('data', (2, 1, 0))  # the reference model has channels in BGR order instead of RGB

        image_width, image_height, image_channel, batch_num = 2560, 768, 3, 1
        net.blobs['data'].reshape(batch_num, image_channel, image_height, image_width)

    if PROCESS_OUTPUT:
        write_img_result = True

    dataset_img_dir = kitti_detection_dataset_dir
    for img_name in sorted(os.listdir(dataset_img_dir)):
        img_path = os.path.join(dataset_img_dir, img_name)
        if not os.path.isfile(img_path):
            continue
        # splitext copes with names containing zero or several dots, where
        # unpacking img_name.split(".") into two names raised ValueError.
        img_index, img_ext = os.path.splitext(img_name)
        if img_ext != '.png':
            continue

        # the path of NN's output
        nn_output_result_path = os.path.join(save_dir, img_index + '.pkl')

        if STORE_OUTPUT:
            print("Processing image: {0}".format(img_path))
            orin_img = caffe.io.load_image(img_path)
            x_flip_img = orin_img[:, ::-1, :]  # horizontal mirror
            orin_net_out = get_net_out(orin_img, net, transformer)
            x_flip_net_out = get_net_out(x_flip_img, net, transformer)
            cur_net_out_dict = {'orin': orin_net_out, 'x_flip': x_flip_net_out}

            store_dict = {'output': cur_net_out_dict, 'img_shape': orin_img.shape[0:2]}
            # store output into pickle
            with open(nn_output_result_path, 'wb') as fid:
                pickle.dump(store_dict, fid)

        if PROCESS_OUTPUT:
            if not STORE_OUTPUT:
                # read output from pickle
                # NOTE: pickle.load executes arbitrary code from the file; only
                # load .pkl files written by the STORE_OUTPUT stage of this script.
                with open(nn_output_result_path, 'rb') as fid:
                    store_dict = pickle.load(fid)
                print("Processing output: {0}".format(nn_output_result_path))
            cur_net_out_dict, img_shape = store_dict['output'], store_dict['img_shape']
            cur_det_result = parse_net_out(cur_net_out_dict, voc_labelmap, img_size=img_shape)
            if len(cur_det_result) > 0:
                cur_result_line_list = transform_det_results_to_linelist(cur_det_result, voc_labelmap)
            else:
                cur_result_line_list = []

            if write_img_result:
                # an (possibly empty) result file is written for every image
                img_txt_result_path = os.path.join(txt_dir, img_index + '.txt')
                with open(img_txt_result_path, 'w') as fid:
                    for line in cur_result_line_list:
                        fid.write(line + '\n')

`
With this change it works very well. For example:

And the pr-curve is:

And I just test this model on kitti-tracking's training set, the pr-curve is:

The text was updated successfully, but these errors were encountered:

hedes1992 · 2017-08-15T09:56:58Z

Sorry, the code is messy, the following is the whole test code

#!/usr/bin/env python
import numpy as np
import os, sys
import pickle
import timeit, Image, ImageDraw
from google.protobuf import text_format
# Root of the caffe checkout: three directory levels above this script.
caffe_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
print('caffe_root is {0}'.format(caffe_root))

# Confidence threshold used as the default by the parsing/formatting helpers
# and embedded in the name of the result directory.
CONFIDENCE_THRES = 0.2

kitti_detection_dataset_dir = '/home/hzw/projects/dataset/kitti/detection/training/image_2/'
model_store_dir = os.path.join(caffe_root, 'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/')
model_def_path = os.path.join(model_store_dir, 'deploy.prototxt')
model_weights_path = os.path.join(model_store_dir, 'VGG_KITTI_RRC_2560x768_kitti_4r4b_max_size_iter_60000.caffemodel')
voc_labelmap_file_path = os.path.join(caffe_root, 'data/KITTI-car/labelmap_voc.prototxt')
# Pickled network outputs go to save_dir; per-image result text files go to a
# threshold-specific sub-directory of it.
save_dir = os.path.join(model_store_dir, 'result-test-kitti_detection_training/')
txt_dir = os.path.join(save_dir, 'conf-{0}/'.format(CONFIDENCE_THRES))

# make sure the working directory is caffe_root
os.chdir(caffe_root)
# add the caffe python module (relative path, hence the chdir above)
sys.path.insert(0, 'python')
import caffe
from caffe.proto import caffe_pb2
from _ensemble import *

def get_labelname(labelmap, labels):
    """Map numeric class labels to their display names.

    Args:
        labelmap: object whose ``item`` sequence holds entries exposing
            ``label`` and ``display_name`` attributes.
        labels: a single label or a list of labels.

    Returns:
        List of display names, one per requested label, in input order.

    Raises:
        AssertionError: if a label has no matching entry in ``labelmap``.
    """
    # isinstance (not an exact type comparison) so that list subclasses are
    # treated as a sequence of labels rather than wrapped as a single label.
    if not isinstance(labels, list):
        labels = [labels]

    labelnames = []
    for label in labels:
        # Iterate the entries directly; this avoids index arithmetic and the
        # Python-2-only ``xrange``.
        for entry in labelmap.item:
            if label == entry.label:
                labelnames.append(entry.display_name)
                break
        else:
            # for/else: only reached when no entry matched. Raised explicitly
            # so the check is not stripped when running with -O.
            raise AssertionError("label {0!r} not found in labelmap".format(label))
    return labelnames

def get_net_out(image, net, transformer):
    """Run one forward pass of ``net`` on ``image`` and return its output.

    Args:
        image: input image, handed to ``transformer.preprocess``.
        net: network object exposing ``blobs['data'].data`` and ``forward()``.
        transformer: preprocessor exposing ``preprocess(blob_name, image)``.

    Returns:
        Whatever ``net.forward()`` returns.
    """
    # Preprocess and write in place into the network's 'data' input blob.
    net.blobs['data'].data[...] = transformer.preprocess('data', image)
    return net.forward()

def parse_net_out(net_out_dict, voc_labelmap, img_size, detection_num=3, conf_thres=CONFIDENCE_THRES):
    """Gather thresholded detections from every stored forward pass and ensemble them.

    Args:
        net_out_dict: mapping from a pass name to that pass's output dict; each
            output dict is indexed with 'detection_out2' up to
            'detection_out<detection_num>'.
        voc_labelmap: accepted for interface compatibility; not used here.
        img_size: sequence whose first two entries are (height, width); box
            coordinates are multiplied by these (presumably the network emits
            normalized coordinates -- confirm against the deploy prototxt).
        detection_num: index of the last 'detection_out' blob to read
            (reading starts at 2).
        conf_thres: detections with confidence >= this value are kept.

    Returns:
        The result of ``det_ensemble`` applied to the stacked rows
        ``[xmin, ymin, xmax, ymax, conf, label]`` and the number of
        detection blobs that were read.
    """
    height, width = img_size[0:2]
    blob_count = 0
    stacked = np.zeros([0, 6], float)

    for pass_out in net_out_dict.values():
        for head in range(2, detection_num + 1):
            rows = pass_out['detection_out%d' % (head)].copy()[0, 0]
            labels = rows[:, 1]
            scores = rows[:, 2]
            # NOTE: the x coordinates are mirrored (1 - x, min/max swapped) for
            # EVERY pass, including the un-flipped 'orin' one. The author found
            # this empirically and could not explain it; keep until understood.
            left = 1 - rows[:, 5]
            top = rows[:, 4]
            right = 1 - rows[:, 3]
            bottom = rows[:, 6]

            # indices of detections at or above the confidence threshold
            keep = np.flatnonzero(scores >= conf_thres)
            if keep.size == 0:
                picked = np.zeros([0, 6], float)
            else:
                picked = np.column_stack((
                    left[keep] * width,
                    top[keep] * height,
                    right[keep] * width,
                    bottom[keep] * height,
                    scores[keep],
                    labels[keep],
                ))

            stacked = np.concatenate((stacked, picked), axis=0)
            blob_count = blob_count + 1

    return det_ensemble(stacked, blob_count)

def transform_det_results_to_linelist(det_results, voc_labelmap, conf_thres=CONFIDENCE_THRES):
    """Format ensembled detections as KITTI-style result lines.

    Args:
        det_results: 2-D array with at least one row; columns 0-5 are read as
            xmin, ymin, xmax, ymax, confidence, label.
        voc_labelmap: label map handed to ``get_labelname``.
        conf_thres: only rows whose confidence is strictly greater are kept.

    Returns:
        List of result strings, one per kept detection.

    Raises:
        AssertionError: if ``det_results`` has no rows.
    """
    assert det_results.shape[0] > 0, "det_results.shape is: " + str(det_results.shape)

    kept = det_results[np.where(det_results[:, 4] > conf_thres)[0]]
    if kept.shape[0] == 0:
        # Nothing passed the threshold; skip the label lookup entirely.
        return []

    names = get_labelname(voc_labelmap, kept[:, 5].tolist())
    result_line_list = []
    for label, row in zip(names, kept):
        xmin, ymin, xmax, ymax, score = row[0], row[1], row[2], row[3], row[4]
        h = float(ymax - ymin)
        w = float(xmax - xmin)
        # drop degenerate boxes (this also protects the h / w division below)
        if (w == 0) or (h == 0):
            continue
        # drop tall, narrow boxes touching the left/right border; 10 and 1230
        # are hard-coded pixel limits (presumably for ~1242 px wide KITTI
        # frames -- confirm before using on other resolutions)
        if (h / w >= 2) and ((xmin < 10) or (xmax > 1230)):
            continue
        # All four box coordinates use %.3f; the earlier "%3.f" for xmax/ymax
        # rounded them to whole pixels.
        result_line = "%s -1 -1 -10 %.3f %.3f %.3f %.3f -1 -1 -1 -1000 -1000 -1000 -10 %.8f" % (
            label, xmin, ymin, xmax, ymax, score)
        result_line_list.append(result_line)
    return result_line_list


if __name__ == "__main__":
    # Driver with two independently switchable stages:
    #   STORE_OUTPUT   - run the network on every PNG (original and
    #                    horizontally flipped) and pickle the raw outputs
    #                    into save_dir;
    #   PROCESS_OUTPUT - turn the (fresh or pickled) outputs into one result
    #                    text file per image under txt_dir.
    caffe.set_device(0)
    caffe.set_mode_gpu()

    STORE_OUTPUT        = False
    PROCESS_OUTPUT      = True

    # prepare the output directories
    for out_dir in (txt_dir, save_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # read the labelmap configuration
    with open(voc_labelmap_file_path, 'r') as fid:
        voc_labelmap = caffe_pb2.LabelMap()
        text_format.Merge(str(fid.read()), voc_labelmap)

    if STORE_OUTPUT:
        # construct the net
        net = caffe.Net(model_def_path, model_weights_path, caffe.TEST)
        # input preprocessing: 'data' is the name of the input blob == net.inputs[0]
        transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
        transformer.set_transpose('data', (2, 0, 1))  # HWC -> CHW
        transformer.set_raw_scale('data', 255)  # the reference model operates on images in [0,255] range instead of [0,1]
        transformer.set_mean('data', np.array([104, 117, 123]))  # mean pixel value
        transformer.set_channel_swap('data', (2, 1, 0))  # the reference model has channels in BGR order instead of RGB

        image_width, image_height, image_channel, batch_num = 2560, 768, 3, 1
        net.blobs['data'].reshape(batch_num, image_channel, image_height, image_width)

    if PROCESS_OUTPUT:
        write_img_result = True

    dataset_img_dir = kitti_detection_dataset_dir
    for img_name in sorted(os.listdir(dataset_img_dir)):
        img_path = os.path.join(dataset_img_dir, img_name)
        if not os.path.isfile(img_path):
            continue
        # splitext copes with names containing zero or several dots, where
        # unpacking img_name.split(".") into two names raised ValueError.
        img_index, img_ext = os.path.splitext(img_name)
        if img_ext != '.png':
            continue

        # the path of NN's output
        nn_output_result_path = os.path.join(save_dir, img_index + '.pkl')

        if STORE_OUTPUT:
            print("Processing image: {0}".format(img_path))
            orin_img = caffe.io.load_image(img_path)
            x_flip_img = orin_img[:, ::-1, :]  # horizontal mirror
            orin_net_out = get_net_out(orin_img, net, transformer)
            x_flip_net_out = get_net_out(x_flip_img, net, transformer)
            cur_net_out_dict = {'orin': orin_net_out, 'x_flip': x_flip_net_out}

            store_dict = {'output': cur_net_out_dict, 'img_shape': orin_img.shape[0:2]}
            # store output into pickle
            with open(nn_output_result_path, 'wb') as fid:
                pickle.dump(store_dict, fid)

        if PROCESS_OUTPUT:
            if not STORE_OUTPUT:
                # read output from pickle
                # NOTE: pickle.load executes arbitrary code from the file; only
                # load .pkl files written by the STORE_OUTPUT stage of this script.
                with open(nn_output_result_path, 'rb') as fid:
                    store_dict = pickle.load(fid)
                print("Processing output: {0}".format(nn_output_result_path))
            cur_net_out_dict, img_shape = store_dict['output'], store_dict['img_shape']
            cur_det_result = parse_net_out(cur_net_out_dict, voc_labelmap, img_size=img_shape)
            if len(cur_det_result) > 0:
                cur_result_line_list = transform_det_results_to_linelist(cur_det_result, voc_labelmap)
            else:
                cur_result_line_list = []

            if write_img_result:
                # an (possibly empty) result file is written for every image
                img_txt_result_path = os.path.join(txt_dir, img_index + '.txt')
                with open(img_txt_result_path, 'w') as fid:
                    for line in cur_result_line_list:
                        fid.write(line + '\n')

What I do not understand is why applying the same operation

det_xmin_i      = 1 - detection_i[0, 0, :, 5]
det_ymin_i      = detection_i[0, 0, :, 4]
det_xmax_i      = 1 - detection_i[0, 0, :, 3]
det_ymax_i      = detection_i[0, 0, :, 6]

to both the original image's output and the horizontally flipped image's output gives a good result. It is very strange.

ZhihongChen123 · 2017-10-31T02:11:03Z

Hello, I would like to ask about the test code: how can I view the test results drawn on the image? Thank you very much!

hedes1992 · 2017-10-31T03:20:49Z

@ZhihongChen123 The code for testing the pretrained model is listed in the second comment. The evaluation code is from the KITTI benchmark.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

strange detection result using the pretrained detection model on kitti-car #30

strange detection result using the pretrained detection model on kitti-car #30

hedes1992 commented Aug 15, 2017

hedes1992 commented Aug 15, 2017

ZhihongChen123 commented Oct 31, 2017

hedes1992 commented Oct 31, 2017

strange detection result using the pretrained detection model on kitti-car #30

strange detection result using the pretrained detection model on kitti-car #30

Comments

hedes1992 commented Aug 15, 2017

make sure the working directory is caffe_root

add the caffe python module

hedes1992 commented Aug 15, 2017

ZhihongChen123 commented Oct 31, 2017

hedes1992 commented Oct 31, 2017