split_to_chunks.py
"""
This script can optionally be used to split a large stack
of images/flipbooks into smaller chunks for proofreading. E.g.,
A stack of 500 flipbooks can be split into 10 chunks or 50 flipbooks
and distributed to different proofreaders for faster turnaround.
Arguments:
----------
im_file: The image stack .tif generated by create_proofreading_stacks.py
mask_file: The mask stack .tif generated by create_proofreading_stacks.py
csv_file: The _consensus_attributes.csv file generated by create_proofreading_stacks.py
save_dir: Where to save the chunks.
cs: Number of images/flipbooks per chunk. Default 50.
Returns:
--------
An image and mask stack as well as an attrs .csv for each chunk.
Note: Chunks can be proofread masks can restacked into a single file with concat_mask_chunks.py.
Retain the unchunked im_file and csv_file if you intend to restack the masks later.
"""
import os
import argparse

import pandas as pd
from skimage import io

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('im_file', type=str,
                        help='Path to stacked images/flipbooks')
    parser.add_argument('mask_file', type=str,
                        help='Path to stacked masks/mask flipbooks')
    parser.add_argument('csv_file', type=str,
                        help='Metadata consensus_attributes.csv file corresponding to images and masks')
    parser.add_argument('save_dir', type=str,
                        help='Directory in which to save chunked stacks of images and masks')
    parser.add_argument('--cs', type=int, default=50,
                        help='Number of images/flipbooks in each chunk.')
    args = parser.parse_args()

    imf = args.im_file
    segf = args.mask_file
    sdir = args.save_dir
    chunk_size = args.cs
    csvf = args.csv_file

    os.makedirs(sdir, exist_ok=True)

    # load the full image/mask stacks and their attributes table
    im = io.imread(imf)
    seg = io.imread(segf)
    attr_csv = pd.read_csv(csvf)

    # ranges of start/end indices for each chunk
    start = 0
    stop = len(attr_csv)
    step = chunk_size
    sindices = range(start, stop, step)
    eindices = range(step, stop + step, step)
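
    # Worked example (assuming a stack of 120 flipbooks and the default
    # chunk size of 50): sindices = (0, 50, 100) and eindices = (50, 100, 150),
    # so zip() pairs them into slices [0:50], [50:100], and [100:150]; the
    # final slice simply stops at the end of the stack.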

    # the batch name is the prefix of the image file name before the first underscore
    batch_name = os.path.basename(imf).split('_')[0]

    for s, e in zip(sindices, eindices):
        s_str = str(s).zfill(4)
        e_str = str(e).zfill(4)

        # names record the range of images/flipbooks in the chunk
        impath = os.path.join(sdir, f'{batch_name}_chunk_{s_str}-{e_str}.tif')
        segpath = os.path.join(sdir, f'{batch_name}_chunk_{s_str}-{e_str}_masks.tif')
        csvpath = os.path.join(sdir, f'{batch_name}_attr_chunk_{s_str}-{e_str}.csv')

        io.imsave(impath, im[s:e], check_contrast=False)
        io.imsave(segpath, seg[s:e], check_contrast=False)

        # update and save the csv file chunk; copy to avoid mutating attr_csv,
        # and shift stack_index so it is relative to the start of the chunk
        chunk_csv = attr_csv.iloc[s:e].copy()
        chunk_csv['stack_index'] = chunk_csv['stack_index'] - s
        chunk_csv.to_csv(csvpath, index=False)
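
# A minimal sketch of how proofread mask chunks could be restacked, in case
# concat_mask_chunks.py is unavailable. This is an assumption about that
# script's behavior, not its actual implementation. The chunk file names sort
# lexicographically because the indices are zero-padded, so sorted() restores
# the original order:
#
#   import numpy as np
#   from glob import glob
#   chunk_files = sorted(glob(os.path.join(sdir, f'{batch_name}_chunk_*_masks.tif')))
#   restacked = np.concatenate([io.imread(f) for f in chunk_files], axis=0)
#   io.imsave(f'{batch_name}_masks_restacked.tif', restacked, check_contrast=False)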