ENH: Allow variable number of columns in input CSV

Affects batchprocessing.py and commandlinebatch.py (used from the command line as `pyradiomicsbatch`). The CSV file used to define all combinations of image and mask should now start with a header line, which should contain at least the columns 'Image' and 'Mask' (specifying the location of the image and mask, respectively). Additional columns can be added; these will be copied to the output. Additionally, fix some small bugs in the command line scripts, and add an option to shorten the image and mask paths in the output to just the file name (by default, the full path as defined in the CSV is stored).
AIM-Harvard · Mar 29, 2017 · e7e2fe6 · e7e2fe6
1 parent 0c928b7
commit e7e2fe6
Show file tree

Hide file tree

Showing 7 changed files with 72 additions and 69 deletions.
diff --git a/bin/batchExamples/batchprocessing.py b/bin/batchExamples/batchprocessing.py
@@ -5,7 +5,6 @@
 import csv
 import logging
 import os
-import traceback
 
 import SimpleITK as sitk
 
@@ -35,21 +34,20 @@ def main():
   logger = rLogger.getChild('batch')
 
   # Set verbosity level for output to stderr (default level = WARNING)
-  # radiomics.setVerbosity(logging.INFO)
+  radiomics.setVerbosity(logging.INFO)
 
   logger.info('Loading CSV')
-  print("Loading CSV")
 
   flists = []
   try:
     with open(inputCSV, 'r') as inFile:
-      cr = csv.reader(inFile, lineterminator='\n')
+      cr = csv.DictReader(inFile, lineterminator='\n')
       flists = [row for row in cr]
   except Exception:
-    logging.error('CSV READ FAILED:\n%s', traceback.format_exc())
+    logging.error('CSV READ FAILED', exc_info=True)
 
-  print("Loading Done")
-  print("Patients: " + str(len(flists)))
+  logger.info('Loading Done')
+  logger.info('Patients: %d', len(flists))
 
   kwargs = {}
   kwargs['binWidth'] = 25
@@ -68,20 +66,15 @@ def main():
 
   for idx, entry in enumerate(flists, start=1):
 
-    print("(%d/%d) Processing Patient: %s, Study: %s, Reader: %s" % (idx, len(flists), entry[0], entry[1], entry[2]))
-    logger.info("(%d/%d) Processing Patient: %s, Study: %s, Reader: %s", idx, len(flists), entry[0], entry[1],
-                entry[2])
+    logger.info("(%d/%d) Processing Patient (Image: %s, Mask: %s)", idx, len(flists), entry['Image'], entry['Mask'])
 
-    imageFilepath = entry[3]
-    maskFilepath = entry[4]
+    imageFilepath = entry['Image']
+    maskFilepath = entry['Mask']
 
     if (imageFilepath is not None) and (maskFilepath is not None):
-      featureVector = collections.OrderedDict()
-      featureVector['PatientID'] = entry[0]
-      featureVector['Study'] = entry[1]
-      featureVector['Reader'] = entry[2]
-      featureVector['image'] = os.path.basename(imageFilepath)
-      featureVector['mask'] = os.path.basename(maskFilepath)
+      featureVector = collections.OrderedDict(entry)
+      featureVector['Image'] = os.path.basename(imageFilepath)
+      featureVector['Mask'] = os.path.basename(maskFilepath)
 
       try:
         featureVector.update(extractor.execute(imageFilepath, maskFilepath))
@@ -97,7 +90,7 @@ def main():
             row.append(featureVector.get(h, "N/A"))
           writer.writerow(row)
       except Exception:
-        logger.error('FEATURE EXTRACTION FAILED:\n%s', traceback.format_exc())
+        logger.error('FEATURE EXTRACTION FAILED', exc_info=True)
 
 if __name__ == '__main__':
   main()
diff --git a/docs/faq.rst b/docs/faq.rst
@@ -66,14 +66,11 @@ Usage
 **How should the input file for** ``pyradiomicsbatch`` **be structured?**
 
 Currently, the input file for ``pyradiomicsbatch`` is a csv file specifying the combinations of images and masks for
-which to extract features. It does not contain a header line, and each line represents one such combination.
-Each line has 5 elements: Patient, Image, Mask, Image location, Mask location. Only the last two elements are 'active'
-(used during extraction), the are the locations of the image and mask files on the computer. The first three elements
-are copied to the output and are there to enable easy clustering of the results or correlation to outcome. These three
-elements are chosen, because patients can have multiple scans (or multiple sequences, e.g. in MRI), and each image can
-be segmented by different readers. As they are not actively used by PyRadiomics, any value is valid, including empty
-values, as long as they are provided (e.g. ",,,<path/to/image>,<path/to/mask>" is valid,
-"<path/to/image>,<path/to/mask>" is not).
+which to extract features. It must contain a header line, in which at least the headers "Image" and "Mask" are specified
+(case sensitive). These identify the columns that contain the file location of the image and the mask, respectively.
+Each subsequent line represents one combination of an image and a mask. Additional columns are also allowed; these are
+copied to the output in the same order as the input, with the columns of the calculated features appended
+at the end. *N.B. All header names should be unique and not match any of the header names produced by pyradiomics.*
 
 **I installed PyRadiomics, but when I run the jupyter notebook, I get** ``ImportError: No module named radiomics``
 

diff --git a/docs/usage.rst b/docs/usage.rst
@@ -75,9 +75,15 @@ Command Line Use
 
     pyradiomicsbatch <path/to/input> <path/to/output>
 
-* The input file for batch processing is a CSV file where each row represents one combination of an image and a
-  segmentation and contains 5 elements: 1) patient ID, 2) sequence name (image identifier), 3) reader (segmentation
-  identifier), 4) path/to/image, 5) path/to/mask.
+* The input file for batch processing is a CSV file where the first row contains headers and each subsequent row
+  represents one combination of an image and a segmentation and contains at least 2 elements: 1) path/to/image,
+  2) path/to/mask. The headers specify the column names and **must** be "Image" and "Mask" for image and mask location,
+  respectively (case sensitive). Additional columns may also be specified; all columns are copied to the output in
+  the same order (with calculated features appended after the last column).
+
+  .. note::
+
+    All headers should be unique and different from headers provided by PyRadiomics (``<filter>_<class>_<feature>``).
 
 * For more information on passing parameter files, setting up logging and controlling output format, run::
 

diff --git a/radiomics/__init__.py b/radiomics/__init__.py
@@ -7,7 +7,6 @@
 import os
 import pkgutil
 import sys
-import traceback
 
 import numpy  # noqa: F401
 
@@ -184,7 +183,7 @@ def getInputImageTypes():
   _cMatsState = 1
   enableCExtensions()
 except Exception:
-  logger.warning("Error loading C extensions, switching to python calculation:\n%s", traceback.format_exc())
+  logger.warning("Error loading C extensions, switching to python calculation:", exc_info=True)
   cMatrices = None  # set cMatrices to None to prevent an import error in the feature classes.
   cShape = None
 

diff --git a/radiomics/featureextractor.py b/radiomics/featureextractor.py
@@ -361,12 +361,16 @@ def enableFeaturesByName(self, **enabledFeatures):
 
   def execute(self, imageFilepath, maskFilepath, label=None):
     """
-    Compute radiomics signature for provide image and mask combination.
-    First, image and mask are loaded and normalized/resampled if necessary. Second, if enabled, provenance information
-    is calculated and stored as part of the result. Next, shape features are calculated on a cropped (no padding)
-    version of the original image. Then other featureclasses are calculated using all specified input images in
-    ``inputImages``. Images are cropped to tumor mask (no padding) after application of any filter and before being
-    passed to the feature class. Finally, the dictionary containing all calculated features is returned.
+    Compute radiomics signature for the provided image and mask combination. It comprises the following steps:
+
+    1. Image and mask are loaded and normalized/resampled if necessary.
+    2. Validity of ROI is checked using :py:func:`~imageoperations.checkMask`, which also computes and returns the
+       bounding box.
+    3. If enabled, provenance information is calculated and stored as part of the result.
+    4. Shape features are calculated on a cropped (no padding) version of the original image.
+    5. Other enabled featureclasses are calculated using all specified input image types in ``inputImages``. Images are
+       cropped to tumor mask (no padding) after application of any filter and before being passed to the feature class.
+    6. The calculated features are returned as a ``collections.OrderedDict``.
 
     :param imageFilepath: SimpleITK Image, or string pointing to image file location
     :param maskFilepath: SimpleITK Image, or string pointing to labelmap file location
@@ -386,14 +390,15 @@ def execute(self, imageFilepath, maskFilepath, label=None):
     self.logger.debug('Enabled features: %s', self.enabledFeatures)
     self.logger.debug('Current settings: %s', self.kwargs)
 
+    # 1. Load the image and mask
     featureVector = collections.OrderedDict()
     image, mask = self.loadImage(imageFilepath, maskFilepath)
 
     if image is None or mask is None:
       # No features can be extracted, return the empty featureVector
       return featureVector
 
-    # Check whether loaded mask contains a valid ROI for feature extraction
+    # 2. Check whether loaded mask contains a valid ROI for feature extraction and get bounding box
     boundingBox = imageoperations.checkMask(image, mask, **self.kwargs)
 
     if boundingBox is None:
@@ -402,10 +407,11 @@ def execute(self, imageFilepath, maskFilepath, label=None):
 
     self.logger.debug('Image and Mask loaded and valid, starting extraction')
 
+    # 3. Add the additional information if enabled
     if self.kwargs['additionalInfo']:
       featureVector.update(self.getProvenance(imageFilepath, maskFilepath, mask))
 
-    # If shape should be calculation, handle it separately here
+    # 4. If shape descriptors should be calculated, handle it separately here
     if 'shape' in self.enabledFeatures.keys():
       croppedImage, croppedMask = imageoperations.cropToTumorMask(image, mask, boundingBox, self.kwargs['label'])
       enabledFeatures = self.enabledFeatures['shape']
@@ -423,6 +429,7 @@ def execute(self, imageFilepath, maskFilepath, label=None):
         newFeatureName = 'original_shape_%s' % (featureName)
         featureVector[newFeatureName] = featureValue
 
+    # 5. Calculate other enabled feature classes using enabled input image types
     # Make generators for all enabled input image types
     self.logger.debug('Creating input image type iterator')
     imageGenerators = []

diff --git a/radiomics/scripts/commandline.py b/radiomics/scripts/commandline.py
@@ -7,7 +7,6 @@
 import logging
 import os.path
 import sys
-import traceback
 
 import six
 
@@ -26,19 +25,21 @@
                          'Default is "txt": one feature per line in format "name:value". For "csv": one row of feature '
                          'names, followed by one row of feature values. For "json": Features are written in a JSON '
                          'format dictionary "{name:value}"')
-parser.add_argument('--param', '-p', metavar='FILE', nargs=1, type=str, default=None,
+parser.add_argument('--param', '-p', metavar='FILE', type=str, default=None,
                     help='Parameter file containing the settings to be used in extraction')
-parser.add_argument('--label', '-l', metavar='N', nargs=1, default=None, type=int,
+parser.add_argument('--label', '-l', metavar='N', default=None, type=int,
                     help='Value of label in mask to use for feature extraction')
 parser.add_argument('--logging-level', metavar='LEVEL',
                     choices=['NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                     default='WARNING', help='Set capture level for logging')
-parser.add_argument('--log-file', metavar='FILE', nargs=1, type=argparse.FileType('w'), default=None,
+parser.add_argument('--log-file', metavar='FILE', type=argparse.FileType('w'), default=None,
                     help='File to append logger output to')
 parser.add_argument('--verbosity', '-v', action='store', nargs='?', default=3, const=4, type=int, choices=range(0, 6),
                     help='Regulate output to stderr. By default [3], level WARNING and up are printed. By specifying '
                     'this argument without a value, level INFO [4] is assumed. A higher value results in more verbose '
                     'output.')
+parser.add_argument('--shorten-path', dest='shorten', action='store_true',
+                    help='specify this argument to image and mask path to just the file names')
 parser.add_argument('--version', action='version', help='Print version and exit',
                     version='%(prog)s ' + radiomics.__version__)
 
@@ -66,14 +67,15 @@ def main():
   # Initialize extractor
   try:
     if args.param is not None:
-      extractor = featureextractor.RadiomicsFeaturesExtractor(args.param[0])
+      extractor = featureextractor.RadiomicsFeaturesExtractor(args.param)
     else:
       extractor = featureextractor.RadiomicsFeaturesExtractor()
     logger.info('Extracting features with kwarg settings: %s\n\tImage: %s\n\tMask: %s',
                 str(extractor.kwargs), os.path.abspath(args.image), os.path.abspath(args.mask))
     featureVector = collections.OrderedDict()
-    featureVector['image'] = os.path.basename(args.image)
-    featureVector['mask'] = os.path.basename(args.mask)
+    if args.shorten:
+      featureVector['Image'] = os.path.basename(args.image)
+      featureVector['Mask'] = os.path.basename(args.mask)
 
     featureVector.update(extractor.execute(args.image, args.mask, args.label))
 
@@ -88,7 +90,7 @@ def main():
       for k, v in six.iteritems(featureVector):
         args.out.write('%s: %s\n' % (k, v))
   except Exception:
-    logger.error('FEATURE EXTRACTION FAILED:\n%s', traceback.format_exc())
+    logger.error('FEATURE EXTRACTION FAILED', exc_info=True)
 
   args.out.close()
   if args.log_file is not None:

diff --git a/radiomics/scripts/commandlinebatch.py b/radiomics/scripts/commandlinebatch.py
@@ -6,35 +6,37 @@
 import json
 import logging
 import os.path
-import traceback
 
 import radiomics
 from radiomics import featureextractor
 
 parser = argparse.ArgumentParser(usage='%(prog)s In Out [Options]')
 parser.add_argument('inFile', metavar='In', type=argparse.FileType('r'),
-                    help='CSV file containing combinations of image and mask. Each row represents one combination with '
-                         'the following elements: (1) Patient Name, (2) Image type, (3) Reader, (4) Image location and '
-                         '(5) Mask location')
+                    help='CSV file containing combinations of image and mask. First row should contain the headers, '
+                         'where "Image" and "Mask" must be present and identify the image and mask locations, '
+                         'respectively. All columns present in CSV file are copied to the output, this enables '
+                         'specification of additional identifiers, such as patient ID, Study ID and Reader.')
 parser.add_argument('outFile', metavar='Out', type=argparse.FileType('w'),
                     help='File to write results to')
 parser.add_argument('--format', '-f', choices=['csv', 'json'], default='csv', help='Format for the output. '
                     'Default is "csv": one row of feature names, followed by one row of feature values for each '
                     'image-mask combination. For "json": Features are written in a JSON format dictionary '
                     '"{name:value}", one line per image-mask combination')
-parser.add_argument('--param', '-p', metavar='FILE', nargs=1, type=str, default=None,
+parser.add_argument('--param', '-p', metavar='FILE', type=str, default=None,
                     help='Parameter file containing the settings to be used in extraction')
-parser.add_argument('--label', '-l', metavar='N', nargs=1, default=None, type=int,
+parser.add_argument('--label', '-l', metavar='N', default=None, type=int,
                     help='Value of label in mask to use for feature extraction')
 parser.add_argument('--logging-level', metavar='LEVEL',
                     choices=['NOTSET', 'DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                     default='WARNING', help='Set capture level for logging')
-parser.add_argument('--log-file', metavar='FILE', nargs=1, type=argparse.FileType('w'), default=None,
+parser.add_argument('--log-file', metavar='FILE', type=argparse.FileType('w'), default=None,
                     help='File to append logger output to')
 parser.add_argument('--verbosity', '-v', action='store', nargs='?', default=3, const=4, type=int, choices=range(0, 6),
                     help='Regulate output to stderr. By default [3], level WARNING and up are printed. By specifying '
                     'this argument without a value, level INFO [4] is assumed. A higher value results in more verbose '
                     'output.')
+parser.add_argument('--shorten-path', dest='shorten', action='store_true',
+                    help='specify this argument to image and mask path to just the file names')
 parser.add_argument('--version', action='version', help='Print version and exit',
                     version='%(prog)s ' + radiomics.__version__)
 
@@ -63,11 +65,11 @@ def main():
   # Load patient list
   flists = []
   try:
-    cr = csv.reader(args.inFile, lineterminator='\n')
+    cr = csv.DictReader(args.inFile, lineterminator='\n')
     flists = [row for row in cr]
     args.inFile.close()
   except Exception:
-    logging.error('CSV READ FAILED:\n%s', traceback.format_exc())
+    logging.error('CSV READ FAILED', exc_info=True)
     args.inFile.close()
     args.outFile.close()
     if args.log_file is not None:
@@ -78,11 +80,11 @@ def main():
   try:
     logger.debug("Initializing extractor")
     if args.param is not None:
-      extractor = featureextractor.RadiomicsFeaturesExtractor(args.param[0])
+      extractor = featureextractor.RadiomicsFeaturesExtractor(args.param)
     else:
       extractor = featureextractor.RadiomicsFeaturesExtractor()
   except Exception:
-    logger.error('EXTRACTOR INITIALIZATION FAILED:\n%s', traceback.format_exc())
+    logger.error('EXTRACTOR INITIALIZATION FAILED', exc_info=True)
     args.outFile.close()
     args.log_file.close()
     exit(-1)
@@ -93,19 +95,16 @@ def main():
   headers = None
   for idx, entry in enumerate(flists, start=1):
 
-    logger.info("(%d/%d) Processing Patient: %s, Study: %s, Reader: %s", idx, len(flists), entry[0], entry[1],
-                entry[2])
+    logger.info("(%d/%d) Processing Patient (Image: %s, Mask: %s)", idx, len(flists), entry['Image'], entry['Mask'])
 
-    imageFilepath = entry[3]
-    maskFilepath = entry[4]
+    imageFilepath = entry['Image']
+    maskFilepath = entry['Mask']
 
     if (imageFilepath is not None) and (maskFilepath is not None):
       featureVector = collections.OrderedDict()
-      featureVector['PatientID'] = entry[0]
-      featureVector['Study'] = entry[1]
-      featureVector['Reader'] = entry[2]
-      featureVector['image'] = os.path.basename(imageFilepath)
-      featureVector['mask'] = os.path.basename(maskFilepath)
+      if args.shorten:
+        featureVector['Image'] = os.path.basename(imageFilepath)
+        featureVector['Mask'] = os.path.basename(maskFilepath)
 
       try:
         featureVector.update(extractor.execute(imageFilepath, maskFilepath, args.label))
@@ -124,7 +123,7 @@ def main():
           json.dump(featureVector, args.out)
           args.out.write('\n')
       except Exception:
-        logger.error('FEATURE EXTRACTION FAILED:\n%s', traceback.format_exc())
+        logger.error('FEATURE EXTRACTION FAILED', exc_info=True)
 
   args.outFile.close()
   if args.log_file is not None: