Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

strange detection result using the pretrained detection model on kitti-car #30

Open
hedes1992 opened this issue Aug 15, 2017 · 3 comments

Comments

@hedes1992
Copy link

I have just downloaded the released pretrained detection model and tested it on the training set of the KITTI 2D object detection benchmark.
At first, I only modified the KITTI dataset directory in rrc_test.py, but I got a bad result. The PR curve is:
car_detection

When I look at the detection results drawn on the images, I see the following:
000020
000001
As you can see, the detected boxes are horizontally mirrored (bilaterally symmetric) relative to the real objects.
I suspected that the coordinate post-processing was wrong, so I modified it to the following code:
`#!/usr/bin/env python
import numpy as np
import os, sys
import pickle
import timeit, Image, ImageDraw
from google.protobuf import text_format
# Repository root: three directory levels above this script.
# (The paste lost the double underscores of ``__file__``; restored here.)
caffe_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
print('caffe_root is {0}'.format(caffe_root))

# Detections at or below this confidence are discarded.
CONFIDENCE_THRES = 0.2

kitti_detection_dataset_dir = '/home/hzw/projects/dataset/kitti/detection/training/image_2/'
model_store_dir = os.path.join(caffe_root, 'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/')
model_def_path = os.path.join(model_store_dir, 'deploy.prototxt')
model_weights_path = os.path.join(model_store_dir, 'VGG_KITTI_RRC_2560x768_kitti_4r4b_max_size_iter_60000.caffemodel')
voc_labelmap_file_path = os.path.join(caffe_root, 'data/KITTI-car/labelmap_voc.prototxt')
# Pickled raw network outputs are stored here, one file per image.
save_dir = os.path.join(caffe_root, 'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/result-test-kitti_detection_training/')
# Per-image result text files are written here, in a sub-directory per threshold.
txt_dir = os.path.join(
    caffe_root,
    'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/result-test-kitti_detection_training/conf-{0}/'.format(CONFIDENCE_THRES))

# make sure the working directory is caffe_root
os.chdir(caffe_root)

# add the caffe python module (relative to the working directory set above)
sys.path.insert(0, 'python')
import caffe
from caffe.proto import caffe_pb2
from _ensemble import *

def get_labelname(labelmap, labels):
    """Translate numeric label ids into display names using a label map.

    Args:
        labelmap: object whose ``item`` sequence holds entries exposing
            ``label`` and ``display_name`` attributes (the script passes a
            ``caffe_pb2.LabelMap``).
        labels: a single label id, or a list of label ids.

    Returns:
        List of display names, one per requested label, in input order.

    Raises:
        AssertionError: if a label has no matching entry in ``labelmap``.
    """
    if not isinstance(labels, list):
        labels = [labels]
    labelnames = []
    for label in labels:
        # The first matching entry wins.
        for item in labelmap.item:
            if label == item.label:
                labelnames.append(item.display_name)
                break
        else:
            # Raised explicitly (instead of ``assert``) so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            raise AssertionError("label {0!r} not found in labelmap".format(label))
    return labelnames

def get_net_out(image, net, transformer):
    """Run one forward pass of ``net`` on ``image`` and return its outputs.

    The image is preprocessed by ``transformer`` for the ``'data'`` input,
    written in place into the network's ``'data'`` blob, and the value of
    ``net.forward()`` is returned unchanged.
    """
    prepared = transformer.preprocess('data', image)
    # Overwrite the blob contents in place rather than rebinding the blob.
    net.blobs['data'].data[...] = prepared
    return net.forward()

def parse_net_out(net_out_dict, voc_labelmap, img_size, detection_num=3, conf_thres=CONFIDENCE_THRES):
    """Collect thresholded detections from every stored output and ensemble them.

    For each entry of ``net_out_dict`` the blobs ``detection_out2`` ..
    ``detection_out<detection_num>`` are read. Columns 1 and 2 of a blob are
    used as label and confidence, columns 3..6 as normalised
    xmin, ymin, xmax, ymax. The x coordinates are mirrored (``1 - x`` with
    min/max swapped) for *every* entry, not only the flipped one.
    Rows with confidence >= ``conf_thres`` are scaled to pixels with
    ``img_size`` (height, width) and stacked as
    ``[xmin, ymin, xmax, ymax, conf, label]``. The stack and the number of
    blobs visited are passed to ``det_ensemble``, whose result is returned.

    ``voc_labelmap`` is accepted but not used here.
    """
    img_h, img_w = img_size[0:2]
    heads_seen = 0
    collected = np.zeros([0, 6], float)

    for head_outputs in net_out_dict.values():
        for head_idx in range(2, detection_num + 1):
            rows = head_outputs['detection_out%d' % (head_idx)].copy()[0, 0]
            labels = rows[:, 1]
            scores = rows[:, 2]
            # NOTE(review): the mirror is applied unconditionally, including to
            # the un-flipped image's output; this is what the author reports
            # as giving good results.
            mirrored_xmin = 1 - rows[:, 5]
            mirrored_xmax = 1 - rows[:, 3]
            ymin = rows[:, 4]
            ymax = rows[:, 6]

            keep = np.flatnonzero(scores >= conf_thres)
            if keep.size == 0:
                kept_rows = np.zeros([0, 6], float)
            else:
                kept_rows = np.column_stack((
                    mirrored_xmin[keep] * img_w,
                    ymin[keep] * img_h,
                    mirrored_xmax[keep] * img_w,
                    ymax[keep] * img_h,
                    scores[keep],
                    labels[keep],
                ))

            collected = np.concatenate((collected, kept_rows), axis=0)
            heads_seen += 1

    return det_ensemble(collected, heads_seen)

def transform_det_results_to_linelist(det_results, voc_labelmap, conf_thres=CONFIDENCE_THRES):
    """Format ensembled detections as KITTI-style result lines.

    Args:
        det_results: 2-D array with at least one row; columns 0..5 are read
            as xmin, ymin, xmax, ymax, confidence, label.
        voc_labelmap: label map handed to ``get_labelname``.
        conf_thres: only rows with confidence strictly greater are kept.

    Returns:
        List of strings of the form
        ``<label> -1 -1 -10 xmin ymin xmax ymax -1 -1 -1 -1000 -1000 -1000 -10 score``.

    Raises:
        AssertionError: if ``det_results`` has no rows.
    """
    assert det_results.shape[0] > 0, "det_results.shape is: " + str(det_results.shape)

    kept = det_results[det_results[:, 4] > conf_thres]
    if kept.shape[0] == 0:
        # Nothing above threshold: avoid looking up names for an empty set.
        return []

    labelnames = get_labelname(voc_labelmap, kept[:, 5].tolist())
    result_line_list = []
    for label, row in zip(labelnames, kept):
        xmin, ymin, xmax, ymax, score = row[0], row[1], row[2], row[3], row[4]
        h = float(ymax - ymin)
        w = float(xmax - xmin)
        if w == 0 or h == 0:
            continue  # degenerate box
        # Drop tall, narrow boxes touching the left/right border.
        # NOTE(review): 1230 is a hard-coded pixel bound, presumably close to
        # the right edge of a KITTI frame; confirm for other image sizes.
        if h / w >= 2 and (xmin < 10 or xmax > 1230):
            continue
        # All four box coordinates use %.3f; the original wrote xmax/ymax with
        # "%3.f", which rounded them to whole pixels.
        result_line_list.append(
            "%s -1 -1 -10 %.3f %.3f %.3f %.3f -1 -1 -1 -1000 -1000 -1000 -10 %.8f"
            % (label, xmin, ymin, xmax, ymax, score))
    return result_line_list

if __name__ == "__main__":
    # Script entry point. With STORE_OUTPUT the network is run on every PNG in
    # the dataset directory (original and horizontally flipped) and the raw
    # outputs are pickled; with PROCESS_OUTPUT the (stored) outputs are parsed
    # and written as one result text file per image.
    caffe.set_device(0)
    caffe.set_mode_gpu()

    STORE_OUTPUT = False
    PROCESS_OUTPUT = True
    write_img_result = True

    # prepare the output directories
    for out_dir in (txt_dir, save_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # read the labelmap configuration
    with open(voc_labelmap_file_path, 'r') as fid:
        voc_labelmap = caffe_pb2.LabelMap()
        text_format.Merge(str(fid.read()), voc_labelmap)

    if STORE_OUTPUT:
        # construct the net
        net = caffe.Net(model_def_path, model_weights_path, caffe.TEST)
        # input preprocessing: 'data' is the name of the input blob
        transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
        transformer.set_transpose('data', (2, 0, 1))  # HWC -> CHW
        transformer.set_raw_scale('data', 255)  # model works on [0,255], not [0,1]
        transformer.set_mean('data', np.array([104, 117, 123]))  # mean pixel value
        transformer.set_channel_swap('data', (2, 1, 0))  # RGB -> BGR

        image_width, image_height, image_channel, batch_num = 2560, 768, 3, 1
        net.blobs['data'].reshape(batch_num, image_channel, image_height, image_width)

    dataset_img_dir = kitti_detection_dataset_dir
    for img_name in sorted(os.listdir(dataset_img_dir)):
        img_path = os.path.join(dataset_img_dir, img_name)
        if not os.path.isfile(img_path):
            continue
        # splitext copes with names that have no dot or several dots, where
        # unpacking ``img_name.split(".")`` raised ValueError.
        img_index, img_ext = os.path.splitext(img_name)
        if img_ext != '.png':
            continue

        # the path of the pickled network output for this image
        nn_output_result_path = os.path.join(save_dir, img_index + '.pkl')

        if STORE_OUTPUT:
            print("Processing image: {0}".format(img_path))
            orin_img = caffe.io.load_image(img_path)
            x_flip_img = orin_img[:, ::-1, :]
            orin_net_out = get_net_out(orin_img, net, transformer)
            x_flip_net_out = get_net_out(x_flip_img, net, transformer)
            cur_net_out_dict = {'orin': orin_net_out, 'x_flip': x_flip_net_out}

            store_dict = {'output': cur_net_out_dict, 'img_shape': orin_img.shape[0:2]}
            with open(nn_output_result_path, 'wb') as fid:
                pickle.dump(store_dict, fid)

        if PROCESS_OUTPUT:
            if not STORE_OUTPUT:
                # NOTE: pickle can execute arbitrary code on load; only read
                # files that this script itself wrote into save_dir.
                with open(nn_output_result_path, 'rb') as fid:
                    store_dict = pickle.load(fid)
                print("Processing output: {0}".format(nn_output_result_path))
            cur_net_out_dict, img_shape = store_dict['output'], store_dict['img_shape']
            cur_det_result = parse_net_out(cur_net_out_dict, voc_labelmap, img_size=img_shape)
            if len(cur_det_result) > 0:
                cur_result_line_list = transform_det_results_to_linelist(cur_det_result, voc_labelmap)
            else:
                cur_result_line_list = []

            if write_img_result:
                img_txt_result_path = os.path.join(txt_dir, img_index + '.txt')
                with open(img_txt_result_path, 'w') as fid:
                    for line in cur_result_line_list:
                        fid.write(line + '\n')

`
And I find that it works very well. For example,
000020 2
000014
000004 6
And the pr-curve is:
car_detection

I also tested this model on the KITTI tracking training set; the PR curve is:
car_detection 2

@hedes1992
Copy link
Author

Sorry, the code above is messy; the following is the complete test code:

#!/usr/bin/env python
import numpy as np
import os, sys
import pickle
import timeit, Image, ImageDraw
from google.protobuf import text_format
# Repository root: three directory levels above this script.
caffe_root = os.path.abspath(__file__)
for _ in range(3):
    caffe_root = os.path.dirname(caffe_root)
print('caffe_root is {0}'.format(caffe_root))

# Confidence threshold used as the default by the parsing helpers below.
CONFIDENCE_THRES = 0.2

kitti_detection_dataset_dir = '/home/hzw/projects/dataset/kitti/detection/training/image_2/'
model_store_dir = os.path.join(caffe_root, 'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/')
model_def_path = os.path.join(model_store_dir, 'deploy.prototxt')
model_weights_path = os.path.join(
    model_store_dir, 'VGG_KITTI_RRC_2560x768_kitti_4r4b_max_size_iter_60000.caffemodel')
voc_labelmap_file_path = os.path.join(caffe_root, 'data/KITTI-car/labelmap_voc.prototxt')
# Pickled raw network outputs go here, one file per image.
save_dir = os.path.join(
    caffe_root,
    'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/result-test-kitti_detection_training/')
# Result text files go here, in a sub-directory named after the threshold.
txt_dir = os.path.join(
    caffe_root,
    'models/VGGNet/KITTI/RRC_2560x768_kitti_4r4b_max_size/result-test-kitti_detection_training/conf-{0}/'.format(CONFIDENCE_THRES))

# The relative 'python' entry added to sys.path below only resolves once the
# working directory is caffe_root, so change directory first.
os.chdir(caffe_root)
sys.path.insert(0, 'python')
import caffe
from caffe.proto import caffe_pb2
from _ensemble import *

def get_labelname(labelmap, labels):
    """Translate numeric label ids into display names using a label map.

    Args:
        labelmap: object whose ``item`` sequence holds entries exposing
            ``label`` and ``display_name`` attributes (the script passes a
            ``caffe_pb2.LabelMap``).
        labels: a single label id, or a list of label ids.

    Returns:
        List of display names, one per requested label, in input order.

    Raises:
        AssertionError: if a label has no matching entry in ``labelmap``.
    """
    if not isinstance(labels, list):
        labels = [labels]
    labelnames = []
    for label in labels:
        # Iterate the entries directly (no ``xrange``), so the function runs
        # on both Python 2 and Python 3. The first matching entry wins.
        for item in labelmap.item:
            if label == item.label:
                labelnames.append(item.display_name)
                break
        else:
            # Raised explicitly (instead of ``assert``) so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            raise AssertionError("label {0!r} not found in labelmap".format(label))
    return labelnames

def get_net_out(image, net, transformer):
    """Run one forward pass of ``net`` on ``image`` and return its outputs.

    The image is preprocessed by ``transformer`` for the ``'data'`` input,
    written in place into the network's ``'data'`` blob, and the value of
    ``net.forward()`` is returned unchanged.
    """
    prepared = transformer.preprocess('data', image)
    # Overwrite the blob contents in place rather than rebinding the blob.
    net.blobs['data'].data[...] = prepared
    return net.forward()

def parse_net_out(net_out_dict, voc_labelmap, img_size, detection_num=3, conf_thres=CONFIDENCE_THRES):
    """Collect thresholded detections from every stored output and ensemble them.

    For each entry of ``net_out_dict`` the blobs ``detection_out2`` ..
    ``detection_out<detection_num>`` are read. Columns 1 and 2 of a blob are
    used as label and confidence, columns 3..6 as normalised
    xmin, ymin, xmax, ymax. The x coordinates are mirrored (``1 - x`` with
    min/max swapped) for *every* entry, not only the flipped one.
    Rows with confidence >= ``conf_thres`` are scaled to pixels with
    ``img_size`` (height, width) and stacked as
    ``[xmin, ymin, xmax, ymax, conf, label]``. The stack and the number of
    blobs visited are passed to ``det_ensemble``, whose result is returned.

    ``voc_labelmap`` is accepted but not used here.
    """
    img_h, img_w = img_size[0:2]
    heads_seen = 0
    collected = np.zeros([0, 6], float)

    for head_outputs in net_out_dict.values():
        for head_idx in range(2, detection_num + 1):
            rows = head_outputs['detection_out%d' % (head_idx)].copy()[0, 0]
            labels = rows[:, 1]
            scores = rows[:, 2]
            # NOTE(review): the mirror is applied unconditionally, including to
            # the un-flipped image's output; this is what the author reports
            # as giving good results.
            mirrored_xmin = 1 - rows[:, 5]
            mirrored_xmax = 1 - rows[:, 3]
            ymin = rows[:, 4]
            ymax = rows[:, 6]

            keep = np.flatnonzero(scores >= conf_thres)
            if keep.size == 0:
                kept_rows = np.zeros([0, 6], float)
            else:
                kept_rows = np.column_stack((
                    mirrored_xmin[keep] * img_w,
                    ymin[keep] * img_h,
                    mirrored_xmax[keep] * img_w,
                    ymax[keep] * img_h,
                    scores[keep],
                    labels[keep],
                ))

            collected = np.concatenate((collected, kept_rows), axis=0)
            heads_seen += 1

    return det_ensemble(collected, heads_seen)

def transform_det_results_to_linelist(det_results, voc_labelmap, conf_thres=CONFIDENCE_THRES):
    """Format ensembled detections as KITTI-style result lines.

    Args:
        det_results: 2-D array with at least one row; columns 0..5 are read
            as xmin, ymin, xmax, ymax, confidence, label.
        voc_labelmap: label map handed to ``get_labelname``.
        conf_thres: only rows with confidence strictly greater are kept.

    Returns:
        List of strings of the form
        ``<label> -1 -1 -10 xmin ymin xmax ymax -1 -1 -1 -1000 -1000 -1000 -10 score``.

    Raises:
        AssertionError: if ``det_results`` has no rows.
    """
    assert det_results.shape[0] > 0, "det_results.shape is: " + str(det_results.shape)

    kept = det_results[det_results[:, 4] > conf_thres]
    if kept.shape[0] == 0:
        # Nothing above threshold: avoid looking up names for an empty set.
        return []

    labelnames = get_labelname(voc_labelmap, kept[:, 5].tolist())
    result_line_list = []
    for label, row in zip(labelnames, kept):
        xmin, ymin, xmax, ymax, score = row[0], row[1], row[2], row[3], row[4]
        h = float(ymax - ymin)
        w = float(xmax - xmin)
        if w == 0 or h == 0:
            continue  # degenerate box
        # Drop tall, narrow boxes touching the left/right border.
        # NOTE(review): 1230 is a hard-coded pixel bound, presumably close to
        # the right edge of a KITTI frame; confirm for other image sizes.
        if h / w >= 2 and (xmin < 10 or xmax > 1230):
            continue
        # All four box coordinates use %.3f; the original wrote xmax/ymax with
        # "%3.f", which rounded them to whole pixels.
        result_line_list.append(
            "%s -1 -1 -10 %.3f %.3f %.3f %.3f -1 -1 -1 -1000 -1000 -1000 -10 %.8f"
            % (label, xmin, ymin, xmax, ymax, score))
    return result_line_list


if __name__ == "__main__":
    # Script entry point. With STORE_OUTPUT the network is run on every PNG in
    # the dataset directory (original and horizontally flipped) and the raw
    # outputs are pickled; with PROCESS_OUTPUT the (stored) outputs are parsed
    # and written as one result text file per image.
    caffe.set_device(0)
    caffe.set_mode_gpu()

    STORE_OUTPUT = False
    PROCESS_OUTPUT = True
    write_img_result = True

    # prepare the output directories
    for out_dir in (txt_dir, save_dir):
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

    # read the labelmap configuration
    with open(voc_labelmap_file_path, 'r') as fid:
        voc_labelmap = caffe_pb2.LabelMap()
        text_format.Merge(str(fid.read()), voc_labelmap)

    if STORE_OUTPUT:
        # construct the net
        net = caffe.Net(model_def_path, model_weights_path, caffe.TEST)
        # input preprocessing: 'data' is the name of the input blob
        transformer = caffe.io.Transformer({'data': net.blobs['data'].data.shape})
        transformer.set_transpose('data', (2, 0, 1))  # HWC -> CHW
        transformer.set_raw_scale('data', 255)  # model works on [0,255], not [0,1]
        transformer.set_mean('data', np.array([104, 117, 123]))  # mean pixel value
        transformer.set_channel_swap('data', (2, 1, 0))  # RGB -> BGR

        image_width, image_height, image_channel, batch_num = 2560, 768, 3, 1
        net.blobs['data'].reshape(batch_num, image_channel, image_height, image_width)

    dataset_img_dir = kitti_detection_dataset_dir
    for img_name in sorted(os.listdir(dataset_img_dir)):
        img_path = os.path.join(dataset_img_dir, img_name)
        if not os.path.isfile(img_path):
            continue
        # splitext copes with names that have no dot or several dots, where
        # unpacking ``img_name.split(".")`` raised ValueError.
        img_index, img_ext = os.path.splitext(img_name)
        if img_ext != '.png':
            continue

        # the path of the pickled network output for this image
        nn_output_result_path = os.path.join(save_dir, img_index + '.pkl')

        if STORE_OUTPUT:
            print("Processing image: {0}".format(img_path))
            orin_img = caffe.io.load_image(img_path)
            x_flip_img = orin_img[:, ::-1, :]
            orin_net_out = get_net_out(orin_img, net, transformer)
            x_flip_net_out = get_net_out(x_flip_img, net, transformer)
            cur_net_out_dict = {'orin': orin_net_out, 'x_flip': x_flip_net_out}

            store_dict = {'output': cur_net_out_dict, 'img_shape': orin_img.shape[0:2]}
            with open(nn_output_result_path, 'wb') as fid:
                pickle.dump(store_dict, fid)

        if PROCESS_OUTPUT:
            if not STORE_OUTPUT:
                # NOTE: pickle can execute arbitrary code on load; only read
                # files that this script itself wrote into save_dir.
                with open(nn_output_result_path, 'rb') as fid:
                    store_dict = pickle.load(fid)
                print("Processing output: {0}".format(nn_output_result_path))
            cur_net_out_dict, img_shape = store_dict['output'], store_dict['img_shape']
            cur_det_result = parse_net_out(cur_net_out_dict, voc_labelmap, img_size=img_shape)
            if len(cur_det_result) > 0:
                cur_result_line_list = transform_det_results_to_linelist(cur_det_result, voc_labelmap)
            else:
                cur_result_line_list = []

            if write_img_result:
                img_txt_result_path = os.path.join(txt_dir, img_index + '.txt')
                with open(img_txt_result_path, 'w') as fid:
                    for line in cur_result_line_list:
                        fid.write(line + '\n')

I do not know why, but I apply the same operation

det_xmin_i      = 1 - detection_i[0, 0, :, 5]
det_ymin_i      = detection_i[0, 0, :, 4]
det_xmax_i      = 1 - detection_i[0, 0, :, 3]
det_ymax_i      = detection_i[0, 0, :, 6]

to both the original image's output and the horizontally flipped image's output, and still get a good result. It's so strange.

@ZhihongChen123
Copy link

Hello, I would like to ask about the test code: how can I view the test results on the image? Thank you very much!

@hedes1992
Copy link
Author

@ZhihongChen123 The code for testing the pretrained model is listed in the 2nd comment. The evaluation code is from the KITTI benchmark.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants