flic

#!/usr/bin/python -B
# -*- coding: utf-8 -*-
###############################################################################
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
###############################################################################

"""
     flic - Format LICences.

    Usage:
        flic --input [input file] --template [template file] > [output file]

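    Options:
        --template     Name of the template to render (looked up in the
                       flic_templates dir)
        -i, --input    License information JSON file (output of slic)
                       (Can be specified more than once)
        -V, --version  Show the version number and exit
        -d, --debug    Output debugging info to flic.log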

    This script is quite specific to the needs of generating the license info
    from the B2G codebase for Firefox OS, but could be adapted. It does
    copyright holder canonicalization and amalgamation, for example, which
    not everyone may want.
"""

# Version of flic as a tuple of integers; main() joins the parts with "." when
# --version is requested.
_version_ = (1, 0, 0)

import logging
# Send log records to the file "flic.log" instead of the console. No level is
# set here; main() switches the "flic" logger to DEBUG when --debug is given.
logging.basicConfig(filename="flic.log")
# Module-wide logger used by every function in this file
log = logging.getLogger("flic")

import sys
import re
import json
import argparse
from os.path import dirname, exists, join, basename, split
from jinja2 import Environment, PackageLoader
import textwrap
import cgi

from slic_results import SlicResults
import utils

# A function used in the template
def template_exists(template):
    """Tell whether ``template`` exists in the ``flic_templates`` directory.

    The directory is located next to the running script, i.e. relative to
    the directory part of ``sys.argv[0]``.  main() passes this function into
    the template rendering context under the name ``template_exists``.

    Returns True if the path exists, False otherwise.
    """
    log.debug("Looking for template: %s" % template)
    templates_dir = join(dirname(sys.argv[0]), "flic_templates")
    candidate = join(templates_dir, template)
    return exists(candidate)


# A mapping to reduce the number of times you get a given copyright holder
# listed "twice" because their name has multiple forms
# Lookup table: copyright holder name as found in the sources -> the single
# spelling under which that holder should be listed.  Long names are wrapped
# using adjacent string literals; the resulting values contain no line breaks.
canonical_holders = {
    "Silicon Graphics":
        "Silicon Graphics Computer Systems, Inc.",

    "The Regents of the University of California. All rights reserved.":
        "Regents of the University of California. All rights reserved.",

    "Mozilla Foundation.":
        "Mozilla Foundation",

    "Android Open Source Project":
        "The Android Open Source Project",

    "The Android Open Source Project All rights reserved.":
        "The Android Open Source Project. All rights reserved.",

    "Student Information Processing Board of the Massachusetts Institute "
    "of Technology.":
        "by the Student Information Processing Board of the Massachusetts "
        "Institute of Technology",

    # The key really does stop at "All"
    "World Wide Web Consortium, (Massachusetts Institute of Technology, "
    "Institut National de Recherche en Informatique et en Automatique, "
    "Keio University). All":
        "World Wide Web Consortium, (Massachusetts Institute of Technology, "
        "Institut National de Recherche en Informatique et en Automatique, "
        "Keio University). All Rights Reserved.",

    "Renesas Technology.":
        "Renesas Technology",
}

def tidy_holder(holder):
    """Return the canonical spelling of a copyright holder name.

    A name that appears as a key in ``canonical_holders`` is replaced by the
    corresponding value; every other name is returned unchanged.  This keeps
    "obvious" variants of one holder from being listed separately.
    """
    return canonical_holders.get(holder, holder)


# Take an array of integer years and turn it into a comma-separated string
# list of years and ranges
def _join_years(years):
    if not years:
        return ""
    
    # uniq
    set = {}
    map(set.__setitem__, years, [])
    years = set.keys()

    # sort
    years.sort()

    years_as_str = []
    range_start = None
    for i in range(len(years) - 1):
        if years[i + 1] == years[i] + 1:
            if not range_start:
                range_start = years[i]
        elif range_start:
            # No range continuation, pending value; finish range
            years_as_str.append("%i-%i" % (range_start, years[i]))
            range_start = None
        else:
            # No range continuation, no pending value
            years_as_str.append(str(years[i]))

    # Final year
    if range_start:
        # Pending value; finish range
        years_as_str.append("%i-%i" % (range_start, years[-1]))
    else:
        # No pending value
        years_as_str.append(str(years[-1]))

    return ", ".join(years_as_str)
            

# Take a string list of years and ranges and turn it into an array of integer
# years
def _split_years(string):
    if not string:
        return []
    
    years = []
    for piece in string.split(','):
        if re.search("^\s*$", piece):
            # Blank line
            continue

        cw_piece = utils.collapse(piece)
        
        if re.search("-", piece):
            # Range
            rng = piece.split('-')

            if not rng[0] or not rng[1]:
                continue
                
            if (re.search("^\s*$", rng[0])):
                continue

            if (re.search("^\s*$", rng[1])):
                years.append(_canonicalize_year(rng[0]))
                continue
            
            start = _canonicalize_year(rng[0])
            end = _canonicalize_year(rng[1])

            if start < 1970 or end > 2030:
                continue
                
            for i in range(start, end + 1):
                years.append(i)
        elif len(cw_piece) > 5:
            # Space-separated years? 5 rather than 4 to try and deal with
            # foolish typos such as "20010".
            sp_years = [_canonicalize_year(year) for year in piece.split()]
            years.extend(sp_years)
        elif len(cw_piece) == 4 or len(cw_piece) == 2:
            # Single year
            years.append(_canonicalize_year(piece))
        else:
            log.warning("Year with strange length: '%s'" % cw_piece) 

    return years


# Make string year an integer, and expand from 2 digits to 4 if necessary
def _canonicalize_year(year):
    assert year != ''
    year = int(year)
    
    if year > 100 and year < 1970:
        log.warning("Strange year: '%s'" % year) 
    
    if year < 100:
        if year > 70:
            year = 1900 + year
        else:
            year = 2000 + year

    return year

###############################################################################
# main()
###############################################################################
def main(argv):
    env = Environment(loader=PackageLoader('__main__', 'flic_templates'),
                      trim_blocks=True,
                      extensions=['jinja2.ext.do'])

    parser = argparse.ArgumentParser(description=\
                                    'Collate and process license information.')
    parser.add_argument('--template', metavar="<template>",
                       help='render the given template name')
    parser.add_argument('-i', '--input', metavar="<input file>", action="append",
                       help='JSON output of "slic" program')
    parser.add_argument('-V', '--version', action="store_true",
                       help='show version number and exit')
    parser.add_argument('-d', '--debug', action="store_true",
                       help='output debugging info to flic.log')

    args = parser.parse_args()

    if args.version:
        ver = '.'.join([str(part) for part in _version_])
        print "flic %s" % ver
        return

    if args.debug:
        log.setLevel(logging.DEBUG)
        
    # Load occurrence data file
    bytag = SlicResults()
    for filename in args.input:
        bytag.load_json(filename)

    # For some licenses, we have a specific set text and so even if small
    # variants are found, we choose to ignore them and amalgamate all the
    # files and copyright holders into a single entry
    single_entry_licenses = ['BSD-4-Clause_RTFM',
                             'BSDProtection',
                             'FTL_fulltext',
                             'IJG_urlref',
                             'BSD\|GPL-2.0',
                             'MirOS',
                             'BSD-3-Clause_urlref',
                             'W3C_urlref',
                             'MIT_urlref',
                             'BSL-1.0_urlref',
                            ]

    s_e_ls = bytag.pop_by_re(single_entry_licenses)
    s_e_ls.unify()
    bytag.update(s_e_ls)
    
    # Convert text to HTML, including inserting 2x<br> at para breaks for
    # formatting on small screens
    for data in bytag.itervalues():
        if not 'text' in data:
            continue

        html = "\n".join(data['text'])
        html = cgi.escape(html)
        
        # Empty line before bullets
        html = re.sub(r"\n\s*([0-9]\.|\*|-)", r"\n\n \1", html)

        # Empty line above acknowledgement text
        html = re.sub(r"(acknowledge?ment:\n)", r"\1\n", html)

        # While we're at it...
        html = re.sub("``", '&#8220;', html)
        html = re.sub("''", '&#8221;', html)

        # Replace all empty lines by double <br>s
        html = re.sub(r"(\n){2,}", '<br><br>', html)
        
        data['html'] = html

    # Post-process and amalgamate copyright lines
    # \xa9 is the copyright symbol
    copy_re = re.compile("""Copyright(s|ed)?:?\s*
                            (\(C\)|\xa9)?\s*
                            (?P<years>[-\d,\s]*)\s
                            (?P<holder>.*)$""",
                         re.IGNORECASE | re.VERBOSE)

    for data in bytag.itervalues():
        if not 'copyrights' in data:
            continue
        
        copyrights = data['copyrights']

        # Amalgamate years
        holders = {}
        for i in range(len(copyrights)):
            match = copy_re.search(copyrights[i])
            if match:
                hits = match.groupdict()
                log.info("Hits: %r" % hits)
                holder = tidy_holder(hits['holder'])
                years = _split_years(hits['years'])
                if holder in holders:
                    holders[holder].extend(years)
                else:
                    holders[holder] = years
            else:
                log.warning("(C) line doesn't match re: %s" % copyrights[i])

        # Rebuild copyright lines
        clean_copyrights = []
        for holder in holders:
            log.debug("Years: %r" % holders[holder])
            years = _join_years(holders[holder])
            spacer = " " if years else ""
            copyright = u"Copyright \xa9 %s%s%s" % (years,
                                                    spacer,
                                                    holder)
            log.debug("Clean C line: %s" % copyright)
            clean_copyrights.append(copyright)

        data['copyrights'] = clean_copyrights

    # Reconcile Bison exception, which can be in a different comment to the GPL
    # license and so is noted as a different "license" :-|
    def resolve_bison_exception(bisonfile):
        log.debug("Trying to resolve bisonexception for %s" % bisonfile)
        for tag in bytag:
            if not re.search("^GPL", tag):
                continue
            if re.search("^GPL-1\.0\+-with-bison-exception", tag):
                continue
                
            for data in bytag[tag]:
                gplfiles = data['files']
                for gplfile in gplfiles:
                    if gplfile == bisonfile:
                        log.info("Resolved bisonexception for %s" % bisonfile)
                        gplfiles.remove(gplfile)
                        
                        # Make sure GPL goes away if there are no GPLed files
                        # left
                        if not gplfiles:
                            bytag[tag].remove(data)

                        if not bytag[tag]:
                            del(bytag[tag])
                            
                        return True

        log.warning("Unable to resolve bisonexception for %s" % bisonfile)
        return False

    if 'GPL-1.0+-with-bison-exception' in bytag:
        for data in bytag['GPL-1.0+-with-bison-exception']:
            bisonfiles = data['files']
            for bisonfile in bisonfiles:
                resolve_bison_exception(bisonfile)

    # Sometimes a file header says "see file FOO for the license". We give all
    # such licenses in slic a tag which includes the string 'fileref'. We must
    # now go through all files so tagged and make sure that a corresponding
    # license file has been found and included.

    # Make a hash lookup table of all files found which match any of the special
    # filenames. It's always possible that the Android people will have renamed
    # the file to "NOTICE"...
    fileref_names = {
        'COPYING_fileref':     ['COPYING', 'NOTICE'],
        'COPYING_fileref2':    ['COPYING', 'NOTICE'],
        'COPYRIGHT_fileref':   ['COPYRIGHT', 'NOTICE'],
        'LICENSE_fileref':     ['LICENSE', 'NOTICE'],
        'bzip2-1.0.6_fileref': ['LICENSE', 'NOTICE'],
        'BSD-3-Clause_fileref_xiph': ['COPYING', 'NOTICE'],
        'BSD_fileref':         ['LICENSE', 'NOTICE', 'README'],
        'BSD_fileref_2':       ['LICENSE.txt'], # sic
        'BSD_fileref_3':       ['README'],
        'MIT_fileref':         ['COPYING', 'NOTICE'],
        'MIT|GPL-2.0_fileref': ['MIT-LICENSE.txt', 'NOTICE'],
        'FTL_fileref':         ['LICENSE.txt', 'docs/FTL.TXT', 'NOTICE'],
        'ISC_fileref':         ['LICENSE', 'NOTICE'],
        'IJG_fileref':         ['README'],        
        'jsimdext_fileref':    ['jsimdext.inc'],
        # .po files have some "same license as" boilerplate
        'po_fileref':          ['COPYING', 'LICENSE', 'NOTICE'],
        'Zlib_fileref':        ['zlib.h'],
        'Libpng_fileref':      ['png.h'],
        'LICENSE.txt_fileref': ['LICENSE.txt']
    }

    # Unique
    unique_filenames = {}
    for fileref in fileref_names:
        map(unique_filenames.__setitem__, fileref_names[fileref], [])

    license_files = {}
    license_files_re = re.compile("^(" + "|".join(unique_filenames.keys()) + ")")

    # Gather list of "LICENSE"-type files        
    for data in bytag.itervalues():
        files = data['files']
        for file in files:
            filename = basename(file)
            if license_files_re.match(filename):
                license_files[file] = data['tag']

    # For each file marked as having a "fileref" license, see if an appropriate
    # file in a higher directory is present and has been included somewhere.
    # Assuming it has, we can ignore the fileref file itself because the
    # license has been noted when we included the referred-to file.
    def find_license_file_for(file, license_file_names):
        log.debug("File path: %s" % file)
        for license_file_name in license_file_names:
            log.debug("Trying to find license file: %s" % license_file_name)
            dir = dirname(file)
            log.debug("Starting directory: %s" % dir)
            while dir != "." and dir != "" and dir != "./gecko":
                log.debug("Looking for %s" % join(dir, license_file_name))
                if join(dir, license_file_name) in license_files:
                    log.debug("Found license %s in dir: %s" % (license_file_name, dir))
                    return True
                # Up one level
                dir = re.sub("/$", "", dir)
                dir = dirname(dir)
                log.debug("Moving up to dir: %s" % dir)
            
        log.debug("Found no license file for %s" % file)
        return False

    fileref_problem_files = []
    fileref_problem_dirs = {}

    for tag in bytag:
        if not re.search("fileref", tag):
            continue

        if not tag in fileref_names:
            log.warning("No license file info for fileref tag '%s'" % tag)
            continue

        license_file_names = fileref_names[tag]
        
        for data in bytag[tag]:
            log.debug("Checking filerefs for tag %s" % tag)
            for file in data['files']:
                # We ignore some copies of header files which are also found
                # elsewhere
                log.info(file)
                if re.search("xulrunner-sdk", file):
                    continue
                    
                if not find_license_file_for(file, license_file_names):
                    fileref_problem_files.append(file)
                    fileref_problem_dirs[split(file)[0]] = 1

    # Render output
    if args.template:
        template = env.get_template(args.template)
        log.info("Rendering")
        print template.render({
            'licenses': bytag,
            'template_exists': template_exists,
            'fileref_problem_files': fileref_problem_files,
            'fileref_problem_dirs': fileref_problem_dirs,
            'license_files': license_files
        }).encode('utf-8')


# Script entry point: hand the full argument vector to main() and use its
# return value as the exit status (main() returns None, i.e. status 0).
if __name__ == "__main__":
    sys.exit(main(sys.argv))