diff --git a/README.md b/README.md
index 870794ba6..69aa075f7 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,99 @@
-This code is currently in the process of being cleaned up and made presentable. The code has been used for the bachelor thesis of Kerim Birgi under supervision of Jonas Hein and Prof. Dr. Marc Pollefeys in the Computer Vision and Geometry Lab at ETH Zurich
-
-# Losses Key:
-- B: Box Localization Loss
-- C: Class Confidence Loss
-- M: Mask Loss
-- P: Prototype Loss
-- D: Coefficient Diversity Loss
-- E: Class Existence Loss
-- S: Semantic Segmentation Loss
+The original docs of the yolact repo contain some information that is not repeated here, so if anything is unclear it is worth reading the original readme file. You can access it [here](https://github.com/dbolya/yolact).
+
+# Recreating Base Model Results from the Thesis
+Note that all evaluation scripts merely produce the predictions of the yolact model. The predictions are evaluated afterwards in the [BachelorsThesis](https://github.com/KGB99/BachelorThesis) repo.
+Both training and evaluation come down to running a script on the cluster, so the table below names the scripts linked to the models presented in the thesis. There are of course many more scripts than these, from experiments not presented in the thesis. A more thorough explanation of everything follows further below.
+The train scripts are all in the train_scripts folder, except for Yolact-Mixed, which is in the refinement_scripts folder; the eval scripts are all in the eval_scripts folder.
+
+| Model name | Train script | Eval script |
+| ---------- | ------------ | ----------- |
+| Yolact-Pbr | pbr_random_kinect_resnet50.sh | eval_pbr_30000.sh |
+| Yolact-Real | ssd_amodal_resnet50_40000.sh | eval_ssd_amodal.sh |
+| Yolact-Augmented | pbr_noise_hue_random_kinect_resnet50.sh | eval_pbr_augmented_30000.sh |
+| Yolact-Mixed | refine_all_no_noise.sh | eval_real_no_noise_27000.sh |
+
+# Recreating Semi-Supervised Model Results from the Thesis
+The scripts in the mask_scripts folder create the pseudo ground-truth annotations. After that, the models have to be trained with the scripts in the supervised_training_scripts folder.
+sampling_stride_10_pbr_base.sh creates masks with the Yolact-Pbr model.
+sampling_refined_all_27000.sh creates masks with the Yolact-Mixed model.
+For evaluation, use the scripts in the eval_scripts folder, namely eval_unsupervised_base_33000.sh for the Yolact-Pbr model and eval_unsupervised_real_33000.sh for the Yolact-Mixed model.
+
+
+# Training Yolact with a custom dataset
+The code in this repo is adapted to use the dataset from [Hein et al.](https://arxiv.org/abs/2305.03535).
+The original repo gives a good explanation of how to train on your own dataset, but the steps below should cover it as well.
+
+### Command line training
+The required packages and pip installs can be found in the requirements.txt file.
+To train yolact, we use the provided train.py file and start the training from the command line. When training from the starting weights, the following command can be used:
+
+```
+python3 train.py --batch_size=8 --num_workers=1 --config=your_config_name
+```
+
+### ETH Cluster job submission training
+Because training requires a GPU, I ran all of my trainings on the cluster. Many examples of submitting a training run as a Slurm job are provided in the train_scripts folder.
+The modules gcc/8.2.0, python_gpu/3.11.2, and eth_proxy are enough to run train.py; eth_proxy is only used for wandb and can be excluded otherwise. Furthermore, for the image augmentation during training you need a virtual env with the imgaug and imgaug.corruptors packages; if you do not wish to use the stronger augmentations, you can skip the virtual python environment lines (namely `source ...` and `deactivate` at the end).
+To provide an example, here is the script for the Yolact-Pbr model from my thesis:
+
+```
+#!/bin/bash
+
+#SBATCH -n 1
+#SBATCH --gpus=1
+#SBATCH --mem-per-cpu=80G
+#SBATCH --time=48:00:00
+
+module load gcc/8.2.0 python_gpu/3.11.2 eth_proxy
+source myenv/bin/activate
+python3 train.py --batch_size=8 --num_workers=1 --config=train_pbr_random_and_kinect_hue_40000
+deactivate
+```
+
+## Config file and Annotations
+Note that the config name in the above commands needs to be adapted to your own config's name, which you must define in data/config.py (MEDICAL_CLASSES contains the powerdrill and screwdriver classes).
+To do this, copying the base dataset config and replacing the following fields is enough:
+
+```
+your_config = dataset_base.copy({
+    'name' : 'your_config_name',
+    'train_images' : '/location/to/your/training/images',
+    'train_info' : '/location/to/your/training/json/file',
+    'valid_images' : '/location/to/your/validation/images',
+    'valid_info' : '/location/to/your/validation/json/file',
+    'has_gt' : True,
+    'class_names' : MEDICAL_CLASSES,
+    'label_map' : None
+})
+```
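+To quickly sanity-check a new config, you can import it directly and verify the paths before launching a training run. A minimal sketch (assuming you defined `your_config` in data/config.py exactly as above; the name is a placeholder):
+
+```
+import os
+from data.config import your_config
+
+# check that the annotation files referenced by the config actually exist
+for path in (your_config.train_info, your_config.valid_info):
+    print(path, 'OK' if os.path.exists(path) else 'MISSING')
+```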
+Yolact reads its training data from a JSON file in the MS-COCO format. These are the "train_info" and "valid_info" files you provide in the config. The format of these JSON files is as follows (placeholders in angle brackets):
+
+```
+{
+    "info": {"description" : "your dataset's description"},
+    "licenses": {},
+    "images": [
+        {
+            "file_name": "image.jpg",
+            "height": <image height>,
+            "width": <image width>,
+            "id": <image id>
+        },
+        ...
+    ],
+    "annotations": [
+        {
+            "segmentation": <segmentation vertices>,
+            "area": <area of segmentation>,
+            "iscrowd": 0,
+            "image_id": <id of the corresponding image in "images">,
+            "bbox": <bbox vertices>,
+            "category_id": <category id of the annotation>,
+            "id": <annotation id>
+        },
+        ...
+    ]
+}
+```
+
+The creation of these labels is done in the [BachelorsThesis](https://github.com/KGB99/BachelorThesis) repo, where the format and its contents are explained in more depth.
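+Before training, it can save time to verify that such a file parses and that every annotation points at a valid image id. A minimal sketch in plain Python (the path is the placeholder from the config above):
+
+```
+import json
+
+# placeholder path, use your own "train_info" file
+with open('/location/to/your/training/json/file') as f:
+    coco = json.load(f)
+
+image_ids = {img['id'] for img in coco['images']}
+orphans = [a['id'] for a in coco['annotations'] if a['image_id'] not in image_ids]
+print(len(coco['images']), 'images,', len(coco['annotations']), 'annotations,',
+      len(orphans), 'orphaned annotations')
+```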
diff --git a/sampling_experiments.py b/sampling_experiments.py
index fd5583404..9544826e4 100644
--- a/sampling_experiments.py
+++ b/sampling_experiments.py
@@ -356,7 +356,7 @@ def create_mask_annotation(image_path,APPROX):
 IOU_THRESHOLD = 0.2 # threshold for how much iou the SA-masks need with yolact preds to be included in result
 SCORE_THRESHOLD = 0.2 # threshold for yolact model
 TOP_K = 5 # top-k for yolact model
-SEGMENT_SAMPLE = False # use the sample method of segment anything, currently not implemented
+SEGMENT_SAMPLE = False # use the sample method of segment anything, currently not implemented!
 SEGMENT_EVERYTHING = True # use segment everything mode of segment anything
 USE_YOLACT = True # keep true unless you do not intend to load the yolact model
 TAKE_MAX_PREDS = True # takes at most the highest scoring prediction per class, none if no prediction
@@ -374,7 +374,6 @@ def create_mask_annotation(image_path,APPROX):
 def create_labels(results_path):
     print('creating labels...')
     VISUALIZE_GEN_MASKS = args.visualize_masks
-    # e.g:print(results_path) = /cluster/project/infk/cvg/heinj/students/kbirgi/generating_masks/pbr_base_30000
     base_img_path = (f"{results_path}/mask_images")
     base_yolact_path = (f"{results_path}/yolact_images")
     base_sam_path = (f"{results_path}/sam_images")
@@ -395,8 +394,6 @@ def create_labels(results_path):
     len_coco_dict = len(coco_dict)
     passed = False
     for i,camera in enumerate(coco_dict):
-
-        #print("BEWARE: USING CHOSEN SCENES FROM CODE! NO OTHER CAMERA ANGLES WILL BE PROCESSED!")
         if not (camera in CHOSEN_SCENES):
             continue
         if not (camera in labels_dict):
@@ -405,15 +402,6 @@ def create_labels(results_path):
         camera_dict = coco_dict[camera]
         len_camera_dict = len(camera_dict)
         for j,imageId in enumerate(camera_dict):
-            if not ('011700.png' in camera_dict[imageId]['img']['file_name']):
-                continue
-            #print(camera_dict[imageId]['img']['file_name'])
-            #if ((not passed) & ((j+1) < start_img)):
-            #    continue
-            #passed = True
-
-            #if ((j % stride) != 0):
-            #    continue
             start_time = time.time()
             print(f"Camera:{i+1:2}/{len_coco_dict}"
@@ -501,19 +489,8 @@ def create_labels(results_path):
             for yolact_pred in yolact_preds:
                 yolact_pred['blobs'] = calculate_blobs(yolact_pred['mask'])
             if CROP_MASKS_PERSONAL:
-                #temp_image_bool = (yolact_preds[0]['mask']).astype(bool)
-                #temp_image = (cv2.cvtColor(temp_image_bool.astype(np.uint8), cv2.COLOR_GRAY2BGR)) * np.array([0,0,255], dtype=np.uint8)
-                #temp_res = cv2.addWeighted(image, 0.5, temp_image, 1, 0)
-                #cv2.imwrite('./testerOutput/noCropMasks.png', temp_res)
                 yolact_preds = crop_masks(yolact_preds, h, w)
-                #res_mask = np.zeros_like(temp_image_bool)
-                #for blob in yolact_preds[0]['blobs']:
-                #    res_mask = np.logical_or(res_mask, blob)
-                #temp_image_bool = (res_mask).astype(bool)
-                #temp_image = (cv2.cvtColor(temp_image_bool.astype(np.uint8), cv2.COLOR_GRAY2BGR)) * np.array([0,0,255], dtype=np.uint8)
-                #temp_res = cv2.addWeighted(image, 0.5, temp_image, 1, 0)
-                #cv2.imwrite('./testerOutput/personalCropMasks.png', temp_res)
-                #exit()
+
             crop_time_end = time.time()
             print(f' Cropping={int(crop_time_end - crop_time_begin):2}s, ', end='')
@@ -531,11 +508,7 @@ def create_labels(results_path):
                 sa_masks = segmentEverything(img_path, anything_generator)
                 sa_time_end = time.time()
                 print(f'SA={int(sa_time_end - sa_time_begin):2}s, ', end='')
-
-                #plt.imshow(image)
-                #show_anns(sa_masks)
-                #plt.savefig(sam_results_path)
-                #plt.close()
+
                 plt.imshow(image)
                 img = show_anns(sa_masks)
@@ -685,10 +658,7 @@ def create_labels(results_path):
     print(" Done.", flush=True)
     with torch.no_grad():
         cudnn.fastest = True
-            torch.set_default_tensor_type('torch.cuda.FloatTensor')
-        #else:
-        #    torch.set_default_tensor_type('torch.FloatTensor')
-        #    dataset = None
+        torch.set_default_tensor_type('torch.cuda.FloatTensor')
 
         print('Loading YOLACT Model...', end='',flush=True)
         net = Yolact()
@@ -746,27 +716,13 @@ def create_labels(results_path):
     #results = {}
     len_coco_dict = len(coco_dict)
-    passed = False
     for i,camera in enumerate(coco_dict):
-        if ((not passed) & ((i+1) < start_cam)):
-            continue
-        #print("BEWARE: USING CHOSEN SCENES FROM CODE! NO OTHER CAMERA ANGLES WILL BE PROCESSED!")
-        #if not (camera in CHOSEN_SCENES):
-        #    continue
         camera_dict = coco_dict[camera]
         len_camera_dict = len(camera_dict)
         camera_results = {}
         for j,imageId in enumerate(camera_dict):
-            print(camera_dict[imageId]['img']['file_name'])
-            if ((MAX_IMAGE != 0) & (j > MAX_IMAGE)):
-                print("FINISHING EARLY!")
-                exit()
-            if ((not passed) & ((j+1) < start_img)):
-                continue
-            passed = True
-
             if ((j % stride) != 0):
                 continue
@@ -775,9 +731,7 @@ def create_labels(results_path):
                 " | Image:" + str(j+1) + "/" + str(len_camera_dict), end = '')
             if SAVE_PLOTS:
-                #fig, axs = plt.subplots(1,2, figsize=(10,6)) # figsize=(10,6)
                 fig, axs = plt.subplots(2,2) #figsize=(10,5))
-                #fig.subplots_adjust(bottom=0.15, hspace=0.3) # Adjust bottom margin to create space for the description
             img_dict = camera_dict[imageId]['img'] # keys: ['id', 'width', 'height', 'file_name']
             mask_dict = camera_dict[imageId]['mask'] # keys: ['segmentation', 'bbox', 'area', 'iscrowd', 'image_id', 'category_id', 'id']
@@ -845,19 +799,9 @@ def create_labels(results_path):
             for yolact_pred in yolact_preds:
                 yolact_pred['blobs'] = calculate_blobs(yolact_pred['mask'])
             if CROP_MASKS_PERSONAL:
-                #temp_image_bool = (yolact_preds[0]['mask']).astype(bool)
-                #temp_image = (cv2.cvtColor(temp_image_bool.astype(np.uint8), cv2.COLOR_GRAY2BGR)) * np.array([0,0,255], dtype=np.uint8)
-                #temp_res = cv2.addWeighted(image, 0.5, temp_image, 1, 0)
-                #cv2.imwrite('./testerOutput/noCropMasks.png', temp_res)
+                # this is an alternative cropping algorithm I created; results show that it is not better,
+                # it just costs a lot of computation for no real gain
                 yolact_preds = crop_masks(yolact_preds, h, w)
-                #res_mask = np.zeros_like(temp_image_bool)
-                #for blob in yolact_preds[0]['blobs']:
-                #    res_mask = np.logical_or(res_mask, blob)
-                #temp_image_bool = (res_mask).astype(bool)
-                #temp_image = (cv2.cvtColor(temp_image_bool.astype(np.uint8), cv2.COLOR_GRAY2BGR)) * np.array([0,0,255], dtype=np.uint8)
-                #temp_res = cv2.addWeighted(image, 0.5, temp_image, 1, 0)
-                #cv2.imwrite('./testerOutput/personalCropMasks.png', temp_res)
-                #exit()
             crop_time_end = time.time()
             print(' Cropping=' + str(int(crop_time_end - crop_time_begin)) + 's, ', end='')
@@ -881,7 +825,6 @@ def create_labels(results_path):
             if SAVE_SA:
                 if SAVE_PLOTS:
-                    #prep_path(sa_path, img_dict['file_name'])
                     axs[0,0].imshow(image)
                     img = show_anns(sa_masks)
                     axs[0,0].imshow(img)
@@ -896,9 +839,6 @@ def create_labels(results_path):
             sa_time_end = time.time()
             print('SA=' + str(int(sa_time_end - sa_time_begin)) + 's, ', end='')
-
-            #plt.imshow(image)
-            #show_anns(sa_masks)
-            #plt.savefig('testerOutput/SegmentAnything_image.png')
             results_time_begin = time.time()
             result_masks = []
             for k,yolact_pred in enumerate(yolact_preds):
@@ -943,9 +883,7 @@ def create_labels(results_path):
                     axs[1,1].imshow(gen_mask_image)
                     axs[1,1].axis('off')
                     axs[1,1].set_title('Generated Mask')
-
-                    #plt.savefig(result_pred_path, bbox_inches='tight', dpi=300)
-                    #plt.close(fig)
+
                 else:
                     cv2.imwrite(result_pred_path, result_image)
                 masks_iou = round(calculateIoU(gt_mask_bool, result_mask), 2)
@@ -956,15 +894,9 @@ def create_labels(results_path):
                     'iou' : masks_iou
                     #leave out the masks for now
                 }
-                #fig.suptitle('IoU(Generated Mask, Ground Truth Mask) = ' + str(masks_iou), fontsize=10, x=0.5, y=0.05)
-                #plt.figtext(0.5, 0.01, 'IoU(Generated Mask, Ground Truth Mask) = ' + str(masks_iou), fontsize=10, ha='center')
+
                 plt.savefig(result_pred_path, bbox_inches='tight', dpi=300)
-
-                #if CREATE_TRAINING_LABELS:
-                #    gen_mask_final = (cv2.cvtColor(gen_mask_bool.astype(np.uint8), cv2.COLOR_GRAY2BGR)) * np.array([255,255,255])
-                #    cv2.imwrite('./testerOutput/generated_mask_final.png', gen_mask_final)
-                #    exit()
-                #camera_results[img_dict['id']][str(k)] = results[img_dict['id']][str(k)]
+            plt.close('all')
             results_time_end = time.time()
             print(' Storing=' + str(int(results_time_end - results_time_begin)) + 's' , end='')
@@ -972,111 +904,8 @@ def create_labels(results_path):
         print(' | Total time: ' + str(int(end_time - start_time)) + 's' , flush=True)
 
         f = open(temp_results_path + '/' + results_dir + '/camera_' + camera + '.json', 'w')
-        #print(camera_results)
-        #print(type(camera_results))
+
         json.dump(camera_results, f, indent=2)
         f.close()
-        #f = open(temp_results_path + '/' + results_dir + '/all_results.json', 'w')
-        #f.write(json.dumps(results))
-        #f.close()
-
     print('OK!')
-
-    exit()
-
-
-
-
-    #everything that follows is old code that i might need again
-    if False:
-        image = cv2.imread(img_path)
-        h, w, colors_dimension = image.shape
-
-        keepers=[]
-        for i,score in enumerate(pred_score):
-            if score < SCORE_THRESHOLD:
-                continue
-            keepers.append(i)
-
-        if False:
-            # assuming bbox is in (x,y,w,h) in relation to total image width
-            pr_x = int(pred_bbox[i][0].item() * w)
-            pr_y = int(pred_bbox[i][1].item() * h)
-            pr_h = int(pred_bbox[i][2].item() * w)
-            pr_w = int(pred_bbox[i][3].item() * h)
-
-            bbox_image = np.zeros_like(image)
-            #print(pr_x,pr_y,pr_h,pr_w)
-            cv2.rectangle(bbox_image, (pr_x, pr_y), (pr_h, pr_w), (0,0,255),3)
-            bbox_result = cv2.addWeighted(image, 1, bbox_image, 0.5, 0)
-
-            mask_image = np.zeros_like(image)
-            print(pred_mask[i].shape)
-            print(pred_proto[i].shape)
-            mask_result = cv2.addWeighted(image, 1, mask_image, 0.5, 0)
-            cv2.imwrite("./testerOutput/" + str(i) + "_" + str(j) + ".png", mask_result)
-            exit()
-
-
-
-
-        #for id in keepers:
-        #print(pred_mask[id])
-        #print(pred_bbox[id])
-
-
-        # TODO: visualize prediction
-        img_numpy = prep_display(preds, frame, None, None, undo_transform=False)
-
-        #if save_path is None:
-        #img_numpy = img_numpy[:, :, (2, 1, 0)]
-
-        #if save_path is None:
-        #    plt.imshow(img_numpy)
-        #    plt.title(path)
-        #    plt.show()
-        #else:
-
-        cv2.imwrite("./testerOutput/" + str(i) + "_" + str(j) + ".png", img_numpy)
-        exit()
-        # TODO: read gt_mask
-
-        # TODO: sample a point from mask
-
-        # TODO: show sampled point in image
-
-
-        image = cv2.imread(img_path)
-        #dets = preds[0]
-        #dets = dets['detection']
-        #proto_data = dets['proto']
-        #pred_bbox = dets['box']
-        #pred_mask = dets['mask']
-        #masks = proto_data @ pred_mask.t()
-        #masks = cfg.mask_proto_mask_activation(masks)
-        #masks = masks.permute(2, 0, 1).contiguous()
-        #print(masks.shape)
-        #exit()
-        #pred_class = dets['class']
-        #pred_score = dets['score']
-        #pred_proto = dets['proto']
-        h,w,colors_dimension = frame.shape
-        t = postprocess(preds, w, h, visualize_lincomb = False, crop_masks = False, score_threshold = SCORE_THRESHOLD)
-        idx = t[1].argsort(0, descending=True)[:TOP_K]
-        masks = t[3][idx]
-        classes, scores, boxes = [x[idx].cpu().numpy() for x in t[:3]]
-        #cv2.imwrite('./testerOutput/puremasks.png',((masks[0] * torch.ones_like(masks[0])) * 255).cpu().numpy())
-        bbox = boxes.squeeze()
-        bbox_image = np.zeros_like(image)
-        cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0,0,255),3)
-        bbox_result = cv2.addWeighted(image, 1, bbox_image, 0.5, 0)
-        masks_bitmask = (masks[0] * torch.ones_like(masks[0]) * 255).cpu().numpy()
-        contours = measure.find_contours(masks_bitmask, 0.5, positive_orientation='low')
-        print(len(contours))
-        #masks_image = np.ones_like(image)
-        #masks_image = cv2.threshold(masks_bitmask, 128, 255, cv2.THRESH_BINARY)
-        #masks_image = cv2.cvtColor(masks_image, cv2.COLOR_BGR2GRAY)
-        #masks_image = cv2.threshold(masks_image, 128, 255, 0)
-        #img2,masks_contours,hierarchy = cv2.findContours(masks_image, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
-        #print(len(masks_contours))
-        exit()
diff --git a/utils/augmentations.py b/utils/augmentations.py
index 43efcb3f5..4627aaaf1 100644
--- a/utils/augmentations.py
+++ b/utils/augmentations.py
@@ -5,7 +5,9 @@ import types
 from numpy import random
 from math import sqrt
-import imgaug.augmenters as iaa
+
+# uncomment this if you want to run the ablation study image augmentations (stronger hue, saturation, grayscale and noise)
+# import imgaug.augmenters as iaa
 
 from data import cfg, MEANS, STD
@@ -181,8 +183,8 @@ def __call__(self, image, masks, boxes, labels=None):
 
 class RandomSaturation(object):
-    # old value of lower was 0.5
-    def __init__(self, lower=0.1, upper=1.5):
+    # value of lower for yolact-augmented = 0.1
+    def __init__(self, lower=0.5, upper=1.7):
         self.lower = lower
         self.upper = upper
         assert self.upper >= self.lower, "contrast upper must be >= lower."
@@ -190,16 +192,20 @@ def __init__(self, lower=0.5, upper=1.7):
     def __call__(self, image, masks=None, boxes=None, labels=None):
         if random.randint(2):
-            if random.randint(10):
-                image[:, :, 1] *= random.uniform(self.lower, self.upper)
-            else:
+            image[:, :, 1] *= random.uniform(self.lower, self.upper)
+            # uncomment the following lines and comment out the line above to introduce grayscale images:
+            #if random.randint(10):
+            #    image[:, :, 1] *= random.uniform(self.lower, self.upper)
+            #else:
                 # we have a 0.1 * 0.5 percent chance that the image will become grayscale
-                image[:, :, 1] *= 0
-            #image[:,:,1] *= 1.5
+            #    image[:, :, 1] *= 0
 
         return image, masks, boxes, labels
 
 
 class RandomHue(object):
+    # value of delta for yolact-augmented = 100
+    # (experiments were done in the range of 100 to 180;
+    # in the end I settled on 100, as there doesn't seem to be much of a difference)
     def __init__(self, delta=18.0):
         assert delta >= 0.0 and delta <= 360.0
         self.delta = delta
@@ -207,7 +213,6 @@ def __init__(self, delta=18.0):
     def __call__(self, image, masks=None, boxes=None, labels=None):
         if (random.randint(2) + 1):
             image[:, :, 0] += random.uniform(-self.delta, self.delta)
-            #image[:, :, 0] += -180
             image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0
             image[:, :, 0][image[:, :, 0] < 0.0] += 360.0
         return image, masks, boxes, labels
@@ -220,29 +225,12 @@ def __init__(self):
                       (2, 0, 1), (2, 1, 0))
 
     def __call__(self, image, masks=None, boxes=None, labels=None):
-        # Don't shuffle the channels please, why would you do this
-
-        # if random.randint(2):
-        #     swap = self.perms[random.randint(len(self.perms))]
-        #     shuffle = SwapChannels(swap)  # shuffle channels
-        #     image = shuffle(image)
-
-        # to cast to uint8 without alterations from float32 we need to do two checks:
-        #check1= check that 0 <= x <= 255
-        #check1 = np.all((image >= 0) & (image <= 255))
-        #check2 = check that theres no decimals
-        #check2 = np.all(image == np.floor(image))
-        #print("hello")
-        #print(check1)
-        #print(check2)
-        #if (check1 & check2):
-        #    print("adding noise")
-        #image_uint8 = image.astype(np.uint8)
-
-        if random.randint(2):
-            noise_severity = random.randint(low=1, high=3)
-            aug = iaa.imgcorruptlike.GaussianNoise(severity = 2)
-            #image = ((aug(images=[image_uint8]))[0])#.astype(np.float32)
-            image = ((aug(images=[image]))[0])
+        # the original code by dbolya also did nothing in this function.
+        # uncomment the following lines to introduce noise to the images:
+        #if random.randint(2):
+        #    noise_severity = random.randint(low=1, high=3)
+        #    aug = iaa.imgcorruptlike.GaussianNoise(severity=noise_severity)
+        #    image = ((aug(images=[image]))[0])
         return image, masks, boxes, labels
@@ -277,7 +265,8 @@ def __call__(self, image, masks=None, boxes=None, labels=None):
 
 class RandomBrightness(object):
-    def __init__(self, delta=32):
+    # yolact-augmented value of delta is 32
+    def __init__(self, delta=10):
         assert delta >= 0.0
         assert delta <= 255.0
         self.delta = delta
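For reference, the noise ablation that the utils/augmentations.py hunk above leaves commented out corresponds roughly to the following standalone snippet (a sketch, assuming imgaug with the imagecorruptions extra is installed; imgcorruptlike expects uint8 images):

```
import numpy as np
import imgaug.augmenters as iaa
from numpy import random

def maybe_add_gaussian_noise(image):
    # 50% chance to corrupt the image, mirroring the commented-out branch above;
    # imgcorruptlike severities range from 1 (mild) to 5 (strong)
    if random.randint(2):
        severity = random.randint(low=1, high=3)  # yields 1 or 2
        aug = iaa.imgcorruptlike.GaussianNoise(severity=int(severity))
        image = aug(images=[image.astype(np.uint8)])[0]
    return image
```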