From 6d76750b0e602a5ce07a7c74a046b56772a32e72 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Tue, 25 Oct 2016 13:48:33 +0100
Subject: [PATCH 01/18] fix #pragma coffee linear loop

---
 tsfc/driver.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tsfc/driver.py b/tsfc/driver.py
index e2d8b8c0..2b05d312 100644
--- a/tsfc/driver.py
+++ b/tsfc/driver.py
@@ -188,7 +188,7 @@ def compile_integral(integral_data, form_data, prefix, parameters,
         for i, quadrature_index in enumerate(quadrature_indices):
             index_names.append((quadrature_index, 'ip_%d' % i))
 
-    body = generate_coffee(impero_c, index_names, parameters["precision"], ir, argument_indices)
+    body = generate_coffee(impero_c, index_names, parameters["precision"], ir, tuple(chain(*argument_indices)))
 
     kernel_name = "%s_%s_integral_%s" % (prefix, integral_type, integral_data.subdomain_id)
     return builder.construct_kernel(kernel_name, body)

From d508d89b74fd8a06f1430644509efd8a750fda14 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 24 Oct 2016 16:51:19 +0100
Subject: [PATCH 02/18] delta elimination and sum factorisation

---
 gem/optimise.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++-
 tsfc/fem.py     |   1 +
 2 files changed, 144 insertions(+), 2 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index 1c5c71da..8cb07e5c 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -2,17 +2,22 @@
 expressions."""
 
 from __future__ import absolute_import, print_function, division
-from six.moves import map
+from six.moves import map, range, zip
 
+from collections import deque
 from functools import reduce
+from itertools import permutations
+
+import numpy
 from singledispatch import singledispatch
 
 from gem.node import Memoizer, MemoizerArg, reuse_if_untouched, reuse_if_untouched_arg
 from gem.gem import (Node, Terminal, Failure, Identity, Literal, Zero,
-                     Sum, Comparison, Conditional, Index,
+                     Product, Sum, Comparison, Conditional, Index,
                      VariableIndex, Indexed, FlexiblyIndexed,
                      IndexSum, ComponentTensor, ListTensor, Delta,
                      partial_indexed)
+from gem.utils import OrderedSet
 
 
 @singledispatch
@@ -190,6 +195,142 @@ def select_expression(expressions, index):
     return ComponentTensor(selected, alpha)
 
 
+@singledispatch
+def _pull_delta_from_listtensor(node, self):
+    raise AssertionError("cannot handle type %s" % type(node))
+
+
+_pull_delta_from_listtensor.register(Node)(reuse_if_untouched)
+
+
+@_pull_delta_from_listtensor.register(ListTensor)
+def _pull_delta_from_listtensor_listtensor(node, self):
+    # Separate Delta nodes from other expressions
+    deltaz = []
+    rests = []
+
+    for child in node.children:
+        deltas = OrderedSet()
+        others = []
+
+        # Traverse Product tree
+        queue = deque([child])
+        while queue:
+            expr = queue.popleft()
+            if isinstance(expr, Product):
+                queue.extend(expr.children)
+            elif isinstance(expr, Delta):
+                assert expr not in deltas
+                deltas.add(expr)
+            else:
+                others.append(self(expr))  # looks for more ListTensors inside
+
+        deltaz.append(deltas)
+        rests.append(reduce(Product, others))
+
+    # Factor out common Delta factors
+    common_deltas = set.intersection(*[set(ds) for ds in deltaz])
+    deltaz = [[d for d in ds if d not in common_deltas] for ds in deltaz]
+
+    # Rebuild ListTensor
+    new_children = [reduce(Product, ds, rest)
+                    for ds, rest in zip(deltaz, rests)]
+    result = node.reconstruct(*new_children)
+
+    # Apply common Delta factors
+    if common_deltas:
+        alpha = tuple(Index(extent=d) for d in result.shape)
+        expr = reduce(Product, common_deltas, Indexed(result, alpha))
+        result = ComponentTensor(expr, alpha)
+    return result
+
+
+def pull_delta_from_listtensor(expression):
+    mapper = Memoizer(_pull_delta_from_listtensor)
+    return mapper(expression)
+
+
+def contraction(expression):
+    # Pull Delta nodes out of annoying ListTensors, and eliminate
+    # annoying ComponentTensors
+    expression, = remove_componenttensors([expression])
+    expression = pull_delta_from_listtensor(expression)
+    expression, = remove_componenttensors([expression])
+
+    # Flatten a product tree
+    sum_indices = []
+    factors = []
+
+    queue = deque([expression])
+    while queue:
+        expr = queue.popleft()
+        if isinstance(expr, IndexSum):
+            queue.append(expr.children[0])
+            sum_indices.append(expr.index)
+        elif isinstance(expr, Product):
+            queue.extend(expr.children)
+        else:
+            factors.append(expr)
+
+    # Try to eliminate Delta nodes
+    delta_queue = [(f, index)
+                   for f in factors if isinstance(f, Delta)
+                   for index in (f.i, f.j) if index in sum_indices]
+    while delta_queue:
+        delta, from_ = delta_queue[0]
+        to_, = list({delta.i, delta.j} - {from_})
+
+        sum_indices.remove(from_)
+
+        mapper = MemoizerArg(filtered_replace_indices)
+        factors = [mapper(e, ((from_, to_),)) for e in factors]
+
+        delta_queue = [(f, index)
+                       for f in factors if isinstance(f, Delta)
+                       for index in (f.i, f.j) if index in sum_indices]
+
+    # Drop ones
+    factors = [e for e in factors if e != Literal(1)]
+
+    # Sum factorisation
+    expression = None
+    best_flops = numpy.inf
+
+    for ordering in permutations(factors):
+        deps = [set(sum_indices) & set(factor.free_indices)
+                for factor in ordering]
+
+        scan_deps = [None] * len(factors)
+        scan_deps[0] = deps[0]
+        for i in range(1, len(factors)):
+            scan_deps[i] = scan_deps[i - 1] | deps[i]
+
+        sum_at = [None] * len(factors)
+        sum_at[0] = scan_deps[0]
+        for i in range(1, len(factors)):
+            sum_at[i] = scan_deps[i] - scan_deps[i - 1]
+
+        expr = None
+        flops = 0
+        for s, f in reversed(list(zip(sum_at, ordering))):
+            if expr is None:
+                expr = f
+            else:
+                expr = Product(f, expr)
+                flops += numpy.prod([i.extent for i in expr.free_indices], dtype=int)
+            if s:
+                flops += numpy.prod([i.extent for i in s])
+            for i in sum_indices:
+                if i in s:
+                    expr = IndexSum(expr, i)
+
+        if flops < best_flops:
+            expression = expr
+            best_flops = flops
+
+    return expression
+
+
 @singledispatch
 def _replace_delta(node, self):
     raise AssertionError("cannot handle type %s" % type(node))
diff --git a/tsfc/fem.py b/tsfc/fem.py
index f4f2be71..b3b70104 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -219,6 +219,7 @@ def callback(entity_id):
                          gem.Indexed(vec, alpha))
     for i in alpha:
         result = gem.IndexSum(result, i)
+    result = gem.optimise.contraction(result)
     if vi:
         return gem.ComponentTensor(result, vi)
     else:

From 106fd0284ecc590302871060fb01e95c78558ff4 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 24 Oct 2016 17:55:42 +0100
Subject: [PATCH 03/18] do not generate empty for loops

COFFEE loop merger does not like them.
---
 gem/impero.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/gem/impero.py b/gem/impero.py
index 9994e5ff..b990aa99 100644
--- a/gem/impero.py
+++ b/gem/impero.py
@@ -146,6 +146,13 @@ class For(Node):
     __slots__ = ('index', 'children')
     __front__ = ('index',)
 
+    def __new__(cls, index, statement):
+        assert isinstance(statement, Block)
+        if not statement.children:
+            return Noop(None)
+        else:
+            return super(For, cls).__new__(cls)
+
     def __init__(self, index, statement):
         self.index = index
         self.children = (statement,)

From 3b315929936ebcf6b692ac476a109af7429e8a68 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 24 Oct 2016 19:27:51 +0100
Subject: [PATCH 04/18] IndexSum with multiindex

---
 gem/gem.py            | 34 +++++++++++++++++-----------------
 gem/optimise.py       | 21 ++++++++++++---------
 tests/test_codegen.py |  2 +-
 tsfc/driver.py        |  8 ++++----
 tsfc/fem.py           |  7 ++-----
 tsfc/ufl2gem.py       |  4 ++--
 6 files changed, 38 insertions(+), 38 deletions(-)

diff --git a/gem/gem.py b/gem/gem.py
index 5251e40f..1713fc68 100644
--- a/gem/gem.py
+++ b/gem/gem.py
@@ -553,26 +553,27 @@ def __new__(cls, expression, multiindex):
 
 
 class IndexSum(Scalar):
-    __slots__ = ('children', 'index')
-    __back__ = ('index',)
+    __slots__ = ('children', 'multiindex')
+    __back__ = ('multiindex',)
 
-    def __new__(cls, summand, index):
+    def __new__(cls, summand, multiindex):
         # Sum zeros
         assert not summand.shape
         if isinstance(summand, Zero):
             return summand
 
-        # Sum a single expression
-        if index.extent == 1:
-            return Indexed(ComponentTensor(summand, (index,)), (0,))
+        # No indices case
+        multiindex = tuple(multiindex)
+        if not multiindex:
+            return summand
 
         self = super(IndexSum, cls).__new__(cls)
         self.children = (summand,)
-        self.index = index
+        self.multiindex = multiindex
 
         # Collect shape and free indices
-        assert index in summand.free_indices
-        self.free_indices = unique(set(summand.free_indices) - {index})
+        assert set(multiindex) <= set(summand.free_indices)
+        self.free_indices = unique(set(summand.free_indices) - set(multiindex))
 
         return self
 
@@ -714,14 +715,13 @@ def unique(indices):
     return tuple(sorted(set(indices), key=id))
 
 
-def index_sum(expression, index):
-    """Eliminates an index from the free indices of an expression by
-    summing over it.  Returns the expression unchanged if the index is
-    not a free index of the expression."""
-    if index in expression.free_indices:
-        return IndexSum(expression, index)
-    else:
-        return expression
+def index_sum(expression, indices):
+    """Eliminates indices from the free indices of an expression by
+    summing over them.  Skips any index that is not a free index of
+    the expression."""
+    multiindex = tuple(index for index in indices
+                       if index in expression.free_indices)
+    return IndexSum(expression, multiindex)
 
 
 def partial_indexed(tensor, indices):
diff --git a/gem/optimise.py b/gem/optimise.py
index 8cb07e5c..7784e2ba 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -266,7 +266,7 @@ def contraction(expression):
         expr = queue.popleft()
         if isinstance(expr, IndexSum):
             queue.append(expr.children[0])
-            sum_indices.append(expr.index)
+            sum_indices.extend(expr.multiindex)
         elif isinstance(expr, Product):
             queue.extend(expr.children)
         else:
@@ -320,9 +320,7 @@ def contraction(expression):
                 flops += numpy.prod([i.extent for i in expr.free_indices], dtype=int)
             if s:
                 flops += numpy.prod([i.extent for i in s])
-            for i in sum_indices:
-                if i in s:
-                    expr = IndexSum(expr, i)
+            expr = IndexSum(expr, tuple(i for i in sum_indices if i in s))
 
         if flops < best_flops:
             expression = expr
@@ -387,13 +385,18 @@ def _unroll_indexsum(node, self):
 
 @_unroll_indexsum.register(IndexSum)  # noqa
 def _(node, self):
-    if node.index.extent <= self.max_extent:
+    unroll = tuple(index for index in node.multiindex
+                   if index.extent <= self.max_extent)
+    if unroll:
         # Unrolling
         summand = self(node.children[0])
-        return reduce(Sum,
-                      (Indexed(ComponentTensor(summand, (node.index,)), (i,))
-                       for i in range(node.index.extent)),
-                      Zero())
+        shape = tuple(index.extent for index in unroll)
+        unrolled = reduce(Sum,
+                          (Indexed(ComponentTensor(summand, unroll), alpha)
+                           for alpha in numpy.ndindex(shape)),
+                          Zero())
+        return IndexSum(unrolled, tuple(index for index in node.multiindex
+                                        if index not in unroll))
     else:
         return reuse_if_untouched(node, self)
 
diff --git a/tests/test_codegen.py b/tests/test_codegen.py
index af8c2f63..ee9a8d31 100644
--- a/tests/test_codegen.py
+++ b/tests/test_codegen.py
@@ -13,7 +13,7 @@ def test_loop_fusion():
 
     def make_expression(i, j):
         A = Variable('A', (6,))
-        s = IndexSum(Indexed(A, (j,)), j)
+        s = IndexSum(Indexed(A, (j,)), (j,))
         return Product(Indexed(A, (i,)), s)
 
     e1 = make_expression(i, j)
diff --git a/tsfc/driver.py b/tsfc/driver.py
index 2b05d312..eeaf1c21 100644
--- a/tsfc/driver.py
+++ b/tsfc/driver.py
@@ -93,6 +93,7 @@ def compile_integral(integral_data, form_data, prefix, parameters,
     argument_indices = tuple(tuple(gem.Index(extent=e)
                                    for e in create_element(arg.ufl_element()).index_shape)
                              for arg in arguments)
+    flat_argument_indices = tuple(chain(*argument_indices))
     quadrature_indices = []
 
     # Dict mapping domains to index in original_form.ufl_domains()
@@ -159,8 +160,7 @@ def compile_integral(integral_data, form_data, prefix, parameters,
         ir = fem.compile_ufl(integrand, interior_facet=interior_facet, **config)
         if parameters["unroll_indexsum"]:
             ir = opt.unroll_indexsum(ir, max_extent=parameters["unroll_indexsum"])
-        for quadrature_index in quadrature_multiindex:
-            ir = [gem.index_sum(expr, quadrature_index) for expr in ir]
+        ir = [gem.index_sum(expr, quadrature_multiindex) for expr in ir]
         irs.append(ir)
 
     # Sum the expressions that are part of the same restriction
@@ -175,7 +175,7 @@ def compile_integral(integral_data, form_data, prefix, parameters,
         builder.require_cell_orientations()
 
     impero_c = impero_utils.compile_gem(return_variables, ir,
-                                        tuple(quadrature_indices) + tuple(chain(*argument_indices)),
+                                        tuple(quadrature_indices) + flat_argument_indices,
                                         remove_zeros=True)
 
     # Generate COFFEE
@@ -188,7 +188,7 @@ def compile_integral(integral_data, form_data, prefix, parameters,
         for i, quadrature_index in enumerate(quadrature_indices):
             index_names.append((quadrature_index, 'ip_%d' % i))
 
-    body = generate_coffee(impero_c, index_names, parameters["precision"], ir, tuple(chain(*argument_indices)))
+    body = generate_coffee(impero_c, index_names, parameters["precision"], ir, flat_argument_indices)
 
     kernel_name = "%s_%s_integral_%s" % (prefix, integral_type, integral_data.subdomain_id)
     return builder.construct_kernel(kernel_name, body)
diff --git a/tsfc/fem.py b/tsfc/fem.py
index b3b70104..7aff1cb0 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -217,9 +217,7 @@ def callback(entity_id):
     vi = tuple(gem.Index(extent=d) for d in mt.expr.ufl_shape)
     result = gem.Product(gem.Indexed(M, alpha + vi),
                          gem.Indexed(vec, alpha))
-    for i in alpha:
-        result = gem.IndexSum(result, i)
-    result = gem.optimise.contraction(result)
+    result = gem.optimise.contraction(gem.IndexSum(result, alpha))
     if vi:
         return gem.ComponentTensor(result, vi)
     else:
@@ -242,6 +240,5 @@ def compile_ufl(expression, interior_facet=False, point_sum=False, **kwargs):
     translator = Translator(context)
     result = map_expr_dags(translator, expressions)
     if point_sum:
-        for index in context.point_set.indices:
-            result = [gem.index_sum(expr, index) for expr in result]
+        result = [gem.index_sum(expr, context.point_set.indices) for expr in result]
     return result
diff --git a/tsfc/ufl2gem.py b/tsfc/ufl2gem.py
index 673d39c6..79ff09df 100644
--- a/tsfc/ufl2gem.py
+++ b/tsfc/ufl2gem.py
@@ -112,9 +112,9 @@ def index_sum(self, o, summand, indices):
 
         if o.ufl_shape:
             indices = tuple(Index() for i in range(len(o.ufl_shape)))
-            return ComponentTensor(IndexSum(Indexed(summand, indices), index), indices)
+            return ComponentTensor(IndexSum(Indexed(summand, indices), (index,)), indices)
         else:
-            return IndexSum(summand, index)
+            return IndexSum(summand, (index,))
 
     def variable(self, o, expression, label):
         # Only used by UFL AD, at this point, the bare expression is

From 7e24f0d949591720d7610822d404e5799eeef7c2 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Wed, 16 Nov 2016 16:08:44 +0000
Subject: [PATCH 05/18] little clean up

---
 gem/optimise.py |  2 +-
 tsfc/fem.py     | 10 ++--------
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index 7784e2ba..82079999 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -367,7 +367,7 @@ def expression(index):
 def replace_delta(expressions):
     """Lowers all Deltas in a multi-root expression DAG."""
     mapper = Memoizer(_replace_delta)
-    return map(mapper, expressions)
+    return list(map(mapper, expressions))
 
 
 @singledispatch
diff --git a/tsfc/fem.py b/tsfc/fem.py
index 7aff1cb0..365f82c3 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -189,10 +189,7 @@ def callback(entity_id):
     vi = tuple(gem.Index(extent=d) for d in mt.expr.ufl_shape)
     argument_index = ctx.argument_indices[terminal.number()]
     result = gem.Indexed(M, argument_index + vi)
-    if vi:
-        return gem.ComponentTensor(result, vi)
-    else:
-        return result
+    return gem.ComponentTensor(result, vi)
 
 
 @translate.register(Coefficient)
@@ -218,10 +215,7 @@ def callback(entity_id):
     result = gem.Product(gem.Indexed(M, alpha + vi),
                          gem.Indexed(vec, alpha))
     result = gem.optimise.contraction(gem.IndexSum(result, alpha))
-    if vi:
-        return gem.ComponentTensor(result, vi)
-    else:
-        return result
+    return gem.ComponentTensor(result, vi)
 
 
 def compile_ufl(expression, interior_facet=False, point_sum=False, **kwargs):

From 5c9c16902972c4a6b37a68420bfcac4dfd67a2fe Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Wed, 16 Nov 2016 16:35:10 +0000
Subject: [PATCH 06/18] add docstrings

---
 gem/optimise.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/gem/optimise.py b/gem/optimise.py
index 82079999..51be1cea 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -197,6 +197,11 @@ def select_expression(expressions, index):
 
 @singledispatch
 def _pull_delta_from_listtensor(node, self):
+    """Pull common delta factors out of ListTensor entries.
+
+    :arg node: root of the expression
+    :arg self: function for recursive calls
+    """
     raise AssertionError("cannot handle type %s" % type(node))
 
 
@@ -246,11 +251,21 @@ def _pull_delta_from_listtensor_listtensor(node, self):
 
 
 def pull_delta_from_listtensor(expression):
+    """Pull common delta factors out of ListTensor entries."""
     mapper = Memoizer(_pull_delta_from_listtensor)
     return mapper(expression)
 
 
 def contraction(expression):
+    """Optimise the contractions of the tensor product at the root of
+    the expression, including:
+
+    - IndexSum-Delta cancellation
+    - Sum factorisation
+
+    This routine was designed with finite element coefficient
+    evaluation in mind.
+    """
     # Pull Delta nodes out of annoying ListTensors, and eliminate
     # annoying ComponentTensors
     expression, = remove_componenttensors([expression])

From 1485550ba337421c66322aca71c0f99e467d5696 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 21 Nov 2016 11:57:57 +0000
Subject: [PATCH 07/18] limit O(N!) algorithm to N <= 5

---
 gem/optimise.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index 51be1cea..e4521f30 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -308,21 +308,19 @@ def contraction(expression):
     factors = [e for e in factors if e != Literal(1)]
 
     # Sum factorisation
-    expression = None
-    best_flops = numpy.inf
-
-    for ordering in permutations(factors):
+    def construct(ordering):
+        """Construct tensor product from a given ordering."""
         deps = [set(sum_indices) & set(factor.free_indices)
                 for factor in ordering]
 
-        scan_deps = [None] * len(factors)
+        scan_deps = [None] * len(ordering)
         scan_deps[0] = deps[0]
-        for i in range(1, len(factors)):
+        for i in range(1, len(ordering)):
             scan_deps[i] = scan_deps[i - 1] | deps[i]
 
-        sum_at = [None] * len(factors)
+        sum_at = [None] * len(ordering)
         sum_at[0] = scan_deps[0]
-        for i in range(1, len(factors)):
+        for i in range(1, len(ordering)):
             sum_at[i] = scan_deps[i] - scan_deps[i - 1]
 
         expr = None
@@ -336,10 +334,25 @@ def contraction(expression):
             if s:
                 flops += numpy.prod([i.extent for i in s])
             expr = IndexSum(expr, tuple(i for i in sum_indices if i in s))
+        return expr, flops
+
+    if len(factors) <= 5:
+        expression = None
+        best_flops = numpy.inf
+
+        for ordering in permutations(factors):
+            expr, flops = construct(ordering)
+            if flops < best_flops:
+                expression = expr
+                best_flops = flops
+    else:
+        # Cheap heuristic
+        def key(factor):
+            return len(set(sum_indices) & set(factor.free_indices))
+        ordering = sorted(factors, key=key)
 
-        if flops < best_flops:
-            expression = expr
-            best_flops = flops
+        # FIXME: Log this unexpected case.
+        expression, flops = construct(ordering)
 
     return expression
 

From 241883447cf18d9798784d286f4487347ffc7e37 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 21 Nov 2016 13:00:39 +0000
Subject: [PATCH 08/18] log warning for unexpected case

---
 gem/optimise.py | 6 ++++--
 tsfc/fem.py     | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index e4521f30..447d8029 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -256,7 +256,7 @@ def pull_delta_from_listtensor(expression):
     return mapper(expression)
 
 
-def contraction(expression):
+def contraction(expression, logger=None):
     """Optimise the contractions of the tensor product at the root of
     the expression, including:
 
@@ -347,11 +347,13 @@ def construct(ordering):
                 best_flops = flops
     else:
         # Cheap heuristic
+        logger.warning("Unexpectedly many terms for sum factorisation: %d"
+                       "; falling back on cheap heuristic.", len(factors))
+
         def key(factor):
             return len(set(sum_indices) & set(factor.free_indices))
         ordering = sorted(factors, key=key)
 
-        # FIXME: Log this unexpected case.
         expression, flops = construct(ordering)
 
     return expression
diff --git a/tsfc/fem.py b/tsfc/fem.py
index 365f82c3..cc10cc05 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -20,6 +20,7 @@
 from tsfc import ufl2gem, geometric
 from tsfc.finatinterface import create_element, as_fiat_cell
 from tsfc.kernel_interface import ProxyKernelInterface
+from tsfc.logging import logger
 from tsfc.modified_terminals import analyse_modified_terminal
 from tsfc.parameters import PARAMETERS
 from tsfc.ufl_utils import ModifiedTerminalMixin, PickRestriction, simplify_abs
@@ -214,7 +215,7 @@ def callback(entity_id):
     vi = tuple(gem.Index(extent=d) for d in mt.expr.ufl_shape)
     result = gem.Product(gem.Indexed(M, alpha + vi),
                          gem.Indexed(vec, alpha))
-    result = gem.optimise.contraction(gem.IndexSum(result, alpha))
+    result = gem.optimise.contraction(gem.IndexSum(result, alpha), logger=logger)
     return gem.ComponentTensor(result, vi)
 
 

From e38596951845a787ea1afcbeea095d65caf06d11 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 21 Nov 2016 13:15:09 +0000
Subject: [PATCH 09/18] add more comments

---
 gem/impero.py   | 4 ++++
 gem/optimise.py | 5 +++++
 2 files changed, 9 insertions(+)

diff --git a/gem/impero.py b/gem/impero.py
index b990aa99..3eee267e 100644
--- a/gem/impero.py
+++ b/gem/impero.py
@@ -147,8 +147,12 @@ class For(Node):
     __front__ = ('index',)
 
     def __new__(cls, index, statement):
+        # In case of an empty loop, create a Noop instead.
+        # Related: https://github.com/coneoproject/COFFEE/issues/98
         assert isinstance(statement, Block)
         if not statement.children:
+            # This "works" because the loop_shape of this node is not
+            # asked any more.
             return Noop(None)
         else:
             return super(For, cls).__new__(cls)
diff --git a/gem/optimise.py b/gem/optimise.py
index 447d8029..15b18172 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -310,19 +310,24 @@ def contraction(expression, logger=None):
     # Sum factorisation
     def construct(ordering):
         """Construct tensor product from a given ordering."""
+        # deps: Indices for each term that need to be summed over.
         deps = [set(sum_indices) & set(factor.free_indices)
                 for factor in ordering]
 
+        # scan_deps: Scan deps to the right with union operation.
         scan_deps = [None] * len(ordering)
         scan_deps[0] = deps[0]
         for i in range(1, len(ordering)):
             scan_deps[i] = scan_deps[i - 1] | deps[i]
 
+        # sum_at: What IndexSum nodes should be inserted before each
+        # term.  An IndexSum binds all terms to its right.
         sum_at = [None] * len(ordering)
         sum_at[0] = scan_deps[0]
         for i in range(1, len(ordering)):
             sum_at[i] = scan_deps[i] - scan_deps[i - 1]
 
+        # Construct expression and count floating-point operations
         expr = None
         flops = 0
         for s, f in reversed(list(zip(sum_at, ordering))):

From 48875f0a31c168493caecf8d8998bd0d42624f68 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Thu, 24 Nov 2016 16:36:33 +0000
Subject: [PATCH 10/18] revise coefficient evaluation

Seems to fix sum factorisation for coefficient derivatives.
---
 gem/gem.py  |   4 ++
 tsfc/fem.py | 115 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 79 insertions(+), 40 deletions(-)

diff --git a/gem/gem.py b/gem/gem.py
index 1713fc68..c298261a 100644
--- a/gem/gem.py
+++ b/gem/gem.py
@@ -448,6 +448,10 @@ def __new__(cls, aggregate, multiindex):
             if isinstance(index, Index):
                 index.set_extent(extent)
 
+        # Empty multiindex
+        if not multiindex:
+            return aggregate
+
         # Zero folding
         if isinstance(aggregate, Zero):
             return Zero()
diff --git a/tsfc/fem.py b/tsfc/fem.py
index cc10cc05..60cf3a9c 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -1,9 +1,11 @@
 from __future__ import absolute_import, print_function, division
+from six import iterkeys, iteritems, itervalues
 from six.moves import map
 
 import collections
 import itertools
 
+import numpy
 from singledispatch import singledispatch
 
 from ufl.corealg.map_dag import map_expr_dag, map_expr_dags
@@ -151,46 +153,51 @@ def translate_facetarea(terminal, mt, ctx):
     return ctx.facetarea()
 
 
-def basis_evaluation(element, ps, derivative=0, entity=None, epsilon=0.0):
-    # TODO: clean up, potentially remove this function.
-    import numpy
+def fiat_to_ufl(fiat_dict, order):
+    # All derivative multiindices must be of the same dimension.
+    dimension, = list(set(len(alpha) for alpha in iterkeys(fiat_dict)))
 
-    finat_result = element.basis_evaluation(derivative, ps, entity)
-    i = element.get_indices()
-    vi = element.get_value_indices()
+    # All derivative tables must have the same shape.
+    shape, = list(set(table.shape for table in itervalues(fiat_dict)))
+    sigma = tuple(gem.Index(extent=extent) for extent in shape)
 
-    dimension = element.cell.get_spatial_dimension()
+    # Convert from FIAT to UFL format
     eye = numpy.eye(dimension, dtype=int)
-    tensor = numpy.empty((dimension,) * derivative, dtype=object)
+    tensor = numpy.empty((dimension,) * order, dtype=object)
     for multiindex in numpy.ndindex(tensor.shape):
         alpha = tuple(eye[multiindex, :].sum(axis=0))
-        tensor[multiindex] = gem.Indexed(finat_result[alpha], i + vi)
-    di = tuple(gem.Index(extent=dimension) for _ in range(derivative))
-    if derivative:
-        tensor = gem.Indexed(gem.ListTensor(tensor), di)
+        tensor[multiindex] = gem.Indexed(fiat_dict[alpha], sigma)
+    delta = tuple(gem.Index(extent=dimension) for _ in range(order))
+    if order > 0:
+        tensor = gem.Indexed(gem.ListTensor(tensor), delta)
     else:
         tensor = tensor[()]
-    # A numerical hack that FFC used to apply on FIAT tables still
-    # lives on after ditching FFC and switching to FInAT.
-    tensor = ffc_rounding(tensor, epsilon)
-    return gem.ComponentTensor(tensor, i + vi + di)
+    return gem.ComponentTensor(tensor, sigma + delta)
 
 
 @translate.register(Argument)
 def translate_argument(terminal, mt, ctx):
+    argument_index = ctx.argument_indices[terminal.number()]
+    sigma = tuple(gem.Index(extent=d) for d in mt.expr.ufl_shape)
     element = create_element(terminal.ufl_element())
 
     def callback(entity_id):
-        return basis_evaluation(element,
-                                ctx.point_set,
-                                derivative=mt.local_derivatives,
-                                entity=(ctx.integration_dim, entity_id),
-                                epsilon=ctx.epsilon)
-    M = ctx.entity_selector(callback, mt.restriction)
-    vi = tuple(gem.Index(extent=d) for d in mt.expr.ufl_shape)
-    argument_index = ctx.argument_indices[terminal.number()]
-    result = gem.Indexed(M, argument_index + vi)
-    return gem.ComponentTensor(result, vi)
+        finat_dict = element.basis_evaluation(mt.local_derivatives,
+                                              ctx.point_set,
+                                              (ctx.integration_dim, entity_id))
+        # Filter out irrelevant derivatives
+        filtered_dict = {alpha: table
+                         for alpha, table in iteritems(finat_dict)
+                         if sum(alpha) == mt.local_derivatives}
+
+        # Change from FIAT to UFL arrangement
+        square = fiat_to_ufl(filtered_dict, mt.local_derivatives)
+
+        # A numerical hack that FFC used to apply on FIAT tables still
+        # lives on after ditching FFC and switching to FInAT.
+        return ffc_rounding(square, ctx.epsilon)
+    table = ctx.entity_selector(callback, mt.restriction)
+    return gem.ComponentTensor(gem.Indexed(table, argument_index + sigma), sigma)
 
 
 @translate.register(Coefficient)
@@ -203,20 +210,48 @@ def translate_coefficient(terminal, mt, ctx):
 
     element = create_element(terminal.ufl_element())
 
-    def callback(entity_id):
-        return basis_evaluation(element,
-                                ctx.point_set,
-                                derivative=mt.local_derivatives,
-                                entity=(ctx.integration_dim, entity_id),
-                                epsilon=ctx.epsilon)
-    M = ctx.entity_selector(callback, mt.restriction)
-
-    alpha = element.get_indices()
-    vi = tuple(gem.Index(extent=d) for d in mt.expr.ufl_shape)
-    result = gem.Product(gem.Indexed(M, alpha + vi),
-                         gem.Indexed(vec, alpha))
-    result = gem.optimise.contraction(gem.IndexSum(result, alpha), logger=logger)
-    return gem.ComponentTensor(result, vi)
+    # Collect FInAT tabulation for all entities
+    per_derivative = collections.defaultdict(list)
+    for entity_id in ctx.entity_ids:
+        finat_dict = element.basis_evaluation(mt.local_derivatives,
+                                              ctx.point_set,
+                                              (ctx.integration_dim, entity_id))
+        for alpha, table in iteritems(finat_dict):
+            # Filter out irrelevant derivatives
+            if sum(alpha) == mt.local_derivatives:
+                # A numerical hack that FFC used to apply on FIAT
+                # tables still lives on after ditching FFC and
+                # switching to FInAT.
+                table = ffc_rounding(table, ctx.epsilon)
+                per_derivative[alpha].append(table)
+
+    # Merge entity tabulations for each derivative
+    if len(ctx.entity_ids) == 1:
+        def take_singleton(xs):
+            x, = xs  # asserts singleton
+            return x
+        per_derivative = {alpha: take_singleton(tables)
+                          for alpha, tables in iteritems(per_derivative)}
+    else:
+        f = ctx.facet_number(mt.restriction)
+        per_derivative = {alpha: gem.select_expression(tables, f)
+                          for alpha, tables in iteritems(per_derivative)}
+
+    # Coefficient evaluation
+    beta = element.get_indices()
+    zeta = element.get_value_indices()
+    value_dict = {}
+    for alpha, table in iteritems(per_derivative):
+        value = gem.IndexSum(gem.Product(gem.Indexed(table, beta + zeta),
+                                         gem.Indexed(vec, beta)),
+                             beta)
+        optimised_value = gem.optimise.contraction(value, logger=logger)
+        value_dict[alpha] = gem.ComponentTensor(optimised_value, zeta)
+
+    # Change from FIAT to UFL arrangement
+    result = fiat_to_ufl(value_dict, mt.local_derivatives)
+    assert result.shape == mt.expr.ufl_shape
+    return result
 
 
 def compile_ufl(expression, interior_facet=False, point_sum=False, **kwargs):

From a65bdfb9f145a5cd6793e4b893cef03ee2b53a18 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Thu, 24 Nov 2016 11:03:28 +0000
Subject: [PATCH 11/18] improve constant folding

---
 gem/gem.py | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/gem/gem.py b/gem/gem.py
index c298261a..a5c1ec59 100644
--- a/gem/gem.py
+++ b/gem/gem.py
@@ -200,12 +200,15 @@ def __new__(cls, a, b):
         assert not a.shape
         assert not b.shape
 
-        # Zero folding
+        # Constant folding
         if isinstance(a, Zero):
             return b
         elif isinstance(b, Zero):
             return a
 
+        if isinstance(a, Constant) and isinstance(b, Constant):
+            return Literal(a.value + b.value)
+
         self = super(Sum, cls).__new__(cls)
         self.children = a, b
         return self
@@ -218,10 +221,18 @@ def __new__(cls, a, b):
         assert not a.shape
         assert not b.shape
 
-        # Zero folding
+        # Constant folding
         if isinstance(a, Zero) or isinstance(b, Zero):
             return Zero()
 
+        if a == one:
+            return b
+        if b == one:
+            return a
+
+        if isinstance(a, Constant) and isinstance(b, Constant):
+            return Literal(a.value * b.value)
+
         self = super(Product, cls).__new__(cls)
         self.children = a, b
         return self
@@ -234,12 +245,18 @@ def __new__(cls, a, b):
         assert not a.shape
         assert not b.shape
 
-        # Zero folding
+        # Constant folding
         if isinstance(b, Zero):
             raise ValueError("division by zero")
         if isinstance(a, Zero):
             return Zero()
 
+        if b == one:
+            return a
+
+        if isinstance(a, Constant) and isinstance(b, Constant):
+            return Literal(a.value / b.value)
+
         self = super(Division, cls).__new__(cls)
         self.children = a, b
         return self
@@ -258,7 +275,7 @@ def __new__(cls, base, exponent):
                 raise ValueError("cannot solve 0^0")
             return Zero()
         elif isinstance(exponent, Zero):
-            return Literal(1)
+            return one
 
         self = super(Power, cls).__new__(cls)
         self.children = base, exponent
@@ -646,7 +663,7 @@ def __new__(cls, i, j):
 
         # \delta_{i,i} = 1
         if i == j:
-            return Literal(1)
+            return one
 
         # Fixed indices
         if isinstance(i, int) and isinstance(j, int):
@@ -768,3 +785,7 @@ def reshape(variable, *shapes):
         dim2idxs.append((0, tuple(idxs)))
     expr = FlexiblyIndexed(variable, tuple(dim2idxs))
     return ComponentTensor(expr, tuple(indices))
+
+
+# Static one object for quicker constant folding
+one = Literal(1)

From b0fe22acf63d18716bb3f0e7613dfe4f7aa59d8f Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Thu, 24 Nov 2016 11:45:53 +0000
Subject: [PATCH 12/18] generate fast Jacobian code snippets

---
 gem/optimise.py | 16 ++++++++++++++++
 tsfc/fem.py     |  8 ++++++++
 2 files changed, 24 insertions(+)

diff --git a/gem/optimise.py b/gem/optimise.py
index 15b18172..b32a2d3a 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -446,3 +446,19 @@ def unroll_indexsum(expressions, max_extent):
     mapper = Memoizer(_unroll_indexsum)
     mapper.max_extent = max_extent
     return list(map(mapper, expressions))
+
+
+def aggressive_unroll(expression):
+    """Aggressively unrolls all loop structures."""
+    # Unroll expression shape
+    if expression.shape:
+        tensor = numpy.empty(expression.shape, dtype=object)
+        for alpha in numpy.ndindex(expression.shape):
+            tensor[alpha] = Indexed(expression, alpha)
+        expression = ListTensor(tensor)
+        expression, = remove_componenttensors((ListTensor(tensor),))
+
+    # Unroll summation
+    expression, = unroll_indexsum((expression,), max_extent=numpy.inf)
+    expression, = remove_componenttensors((expression,))
+    return expression
diff --git a/tsfc/fem.py b/tsfc/fem.py
index 60cf3a9c..545da67c 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -14,6 +14,7 @@
                          GeometricQuantity, QuadratureWeight)
 
 import gem
+from gem.node import traversal
 from gem.optimise import ffc_rounding
 from gem.utils import cached_property
 
@@ -251,6 +252,13 @@ def take_singleton(xs):
     # Change from FIAT to UFL arrangement
     result = fiat_to_ufl(value_dict, mt.local_derivatives)
     assert result.shape == mt.expr.ufl_shape
+    assert set(result.free_indices) <= set(ctx.point_set.indices)
+
+    # Detect Jacobian of affine cells
+    if not result.free_indices and all(numpy.count_nonzero(node.array) <= 2
+                                       for node in traversal((result,))
+                                       if isinstance(node, gem.Literal)):
+        result = gem.optimise.aggressive_unroll(result)
     return result
 
 

From 629b3929903bc748133f6449d4eeed29ce41fc2a Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Thu, 24 Nov 2016 17:13:40 +0000
Subject: [PATCH 13/18] remove Delta * ListTensor factorisation

Not need any more.
---
 gem/optimise.py | 67 +------------------------------------------------
 1 file changed, 1 insertion(+), 66 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index b32a2d3a..4cf8e930 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -17,7 +17,6 @@
                      VariableIndex, Indexed, FlexiblyIndexed,
                      IndexSum, ComponentTensor, ListTensor, Delta,
                      partial_indexed)
-from gem.utils import OrderedSet
 
 
 @singledispatch
@@ -195,67 +194,6 @@ def select_expression(expressions, index):
     return ComponentTensor(selected, alpha)
 
 
-@singledispatch
-def _pull_delta_from_listtensor(node, self):
-    """Pull common delta factors out of ListTensor entries.
-
-    :arg node: root of the expression
-    :arg self: function for recursive calls
-    """
-    raise AssertionError("cannot handle type %s" % type(node))
-
-
-_pull_delta_from_listtensor.register(Node)(reuse_if_untouched)
-
-
-@_pull_delta_from_listtensor.register(ListTensor)
-def _pull_delta_from_listtensor_listtensor(node, self):
-    # Separate Delta nodes from other expressions
-    deltaz = []
-    rests = []
-
-    for child in node.children:
-        deltas = OrderedSet()
-        others = []
-
-        # Traverse Product tree
-        queue = deque([child])
-        while queue:
-            expr = queue.popleft()
-            if isinstance(expr, Product):
-                queue.extend(expr.children)
-            elif isinstance(expr, Delta):
-                assert expr not in deltas
-                deltas.add(expr)
-            else:
-                others.append(self(expr))  # looks for more ListTensors inside
-
-        deltaz.append(deltas)
-        rests.append(reduce(Product, others))
-
-    # Factor out common Delta factors
-    common_deltas = set.intersection(*[set(ds) for ds in deltaz])
-    deltaz = [[d for d in ds if d not in common_deltas] for ds in deltaz]
-
-    # Rebuild ListTensor
-    new_children = [reduce(Product, ds, rest)
-                    for ds, rest in zip(deltaz, rests)]
-    result = node.reconstruct(*new_children)
-
-    # Apply common Delta factors
-    if common_deltas:
-        alpha = tuple(Index(extent=d) for d in result.shape)
-        expr = reduce(Product, common_deltas, Indexed(result, alpha))
-        result = ComponentTensor(expr, alpha)
-    return result
-
-
-def pull_delta_from_listtensor(expression):
-    """Pull common delta factors out of ListTensor entries."""
-    mapper = Memoizer(_pull_delta_from_listtensor)
-    return mapper(expression)
-
-
 def contraction(expression, logger=None):
     """Optimise the contractions of the tensor product at the root of
     the expression, including:
@@ -266,10 +204,7 @@ def contraction(expression, logger=None):
     This routine was designed with finite element coefficient
     evaluation in mind.
     """
-    # Pull Delta nodes out of annoying ListTensors, and eliminate
-    # annoying ComponentTensors
-    expression, = remove_componenttensors([expression])
-    expression = pull_delta_from_listtensor(expression)
+    # Eliminate annoying ComponentTensors
     expression, = remove_componenttensors([expression])
 
     # Flatten a product tree

From f78545be32a65563b7fac9b2283c14915a3541ba Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Fri, 25 Nov 2016 11:23:44 +0000
Subject: [PATCH 14/18] use one in gem.optimise

---
 gem/optimise.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index 4cf8e930..d1b6a13b 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -16,7 +16,7 @@
                      Product, Sum, Comparison, Conditional, Index,
                      VariableIndex, Indexed, FlexiblyIndexed,
                      IndexSum, ComponentTensor, ListTensor, Delta,
-                     partial_indexed)
+                     partial_indexed, one)
 
 
 @singledispatch
@@ -240,7 +240,7 @@ def contraction(expression, logger=None):
                        for index in (f.i, f.j) if index in sum_indices]
 
     # Drop ones
-    factors = [e for e in factors if e != Literal(1)]
+    factors = [e for e in factors if e != one]
 
     # Sum factorisation
     def construct(ordering):
@@ -331,7 +331,7 @@ def expression(index):
                 raise ValueError("Cannot convert running index to expression.")
         e_i = expression(i)
         e_j = expression(j)
-        return Conditional(Comparison("==", e_i, e_j), Literal(1), Zero())
+        return Conditional(Comparison("==", e_i, e_j), one, Zero())
 
 
 def replace_delta(expressions):

From e6aec8faa9e3b53765585dafeb5d4f4ef36ac6b7 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Fri, 25 Nov 2016 12:08:09 +0000
Subject: [PATCH 15/18] replace sum factorisation algorithm

---
 gem/optimise.py | 83 ++++++++++++++++++-------------------------------
 1 file changed, 30 insertions(+), 53 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index d1b6a13b..b5d37346 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -2,9 +2,10 @@
 expressions."""
 
 from __future__ import absolute_import, print_function, division
-from six.moves import map, range, zip
+from six import itervalues
+from six.moves import map, zip
 
-from collections import deque
+from collections import OrderedDict, deque
 from functools import reduce
 from itertools import permutations
 
@@ -242,59 +243,35 @@ def contraction(expression, logger=None):
     # Drop ones
     factors = [e for e in factors if e != one]
 
-    # Sum factorisation
-    def construct(ordering):
-        """Construct tensor product from a given ordering."""
-        # deps: Indices for each term that need to be summed over.
-        deps = [set(sum_indices) & set(factor.free_indices)
-                for factor in ordering]
-
-        # scan_deps: Scan deps to the right with union operation.
-        scan_deps = [None] * len(ordering)
-        scan_deps[0] = deps[0]
-        for i in range(1, len(ordering)):
-            scan_deps[i] = scan_deps[i - 1] | deps[i]
-
-        # sum_at: What IndexSum nodes should be inserted before each
-        # term.  An IndexSum binds all terms to its right.
-        sum_at = [None] * len(ordering)
-        sum_at[0] = scan_deps[0]
-        for i in range(1, len(ordering)):
-            sum_at[i] = scan_deps[i] - scan_deps[i - 1]
-
-        # Construct expression and count floating-point operations
-        expr = None
-        flops = 0
-        for s, f in reversed(list(zip(sum_at, ordering))):
-            if expr is None:
-                expr = f
-            else:
-                expr = Product(f, expr)
-                flops += numpy.prod([i.extent for i in expr.free_indices], dtype=int)
-            if s:
-                flops += numpy.prod([i.extent for i in s])
-            expr = IndexSum(expr, tuple(i for i in sum_indices if i in s))
-        return expr, flops
-
-    if len(factors) <= 5:
-        expression = None
-        best_flops = numpy.inf
-
-        for ordering in permutations(factors):
-            expr, flops = construct(ordering)
-            if flops < best_flops:
-                expression = expr
-                best_flops = flops
-    else:
-        # Cheap heuristic
-        logger.warning("Unexpectedly many terms for sum factorisation: %d"
-                       "; falling back on cheap heuristic.", len(factors))
+    # Form groups by free indices
+    groups = OrderedDict()
+    for factor in factors:
+        groups[factor.free_indices] = []
+    for factor in factors:
+        groups[factor.free_indices].append(factor)
+    groups = [reduce(Product, terms) for terms in itervalues(groups)]
 
-        def key(factor):
-            return len(set(sum_indices) & set(factor.free_indices))
-        ordering = sorted(factors, key=key)
+    # Sum factorisation
+    expression = None
+    best_flops = numpy.inf
 
-        expression, flops = construct(ordering)
+    for ordering in permutations(sum_indices):
+        terms = groups[:]
+        flops = 0
+        for sum_index in ordering:
+            contract = [t for t in terms if sum_index in t.free_indices]
+            deferred = [t for t in terms if sum_index not in t.free_indices]
+
+            product = reduce(Product, contract)
+            term = IndexSum(product, (sum_index,))
+            flops += len(contract) * numpy.prod([i.extent for i in product.free_indices], dtype=int)
+            terms = deferred + [term]
+        expr = reduce(Product, terms)
+        flops += (len(terms) - 1) * numpy.prod([i.extent for i in expr.free_indices], dtype=int)
+
+        if flops < best_flops:
+            expression = expr
+            best_flops = flops
 
     return expression
 

From ac430874c5548c8551233fdece79efc642454cd6 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Fri, 25 Nov 2016 15:34:14 +0000
Subject: [PATCH 16/18] clean up sum factorisation

---
 gem/optimise.py | 90 ++++++++++++++++++++++++++++++++++---------------
 tsfc/fem.py     |  3 +-
 2 files changed, 64 insertions(+), 29 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index b5d37346..05c1baa4 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -195,35 +195,15 @@ def select_expression(expressions, index):
     return ComponentTensor(selected, alpha)
 
 
-def contraction(expression, logger=None):
-    """Optimise the contractions of the tensor product at the root of
-    the expression, including:
-
-    - IndexSum-Delta cancellation
-    - Sum factorisation
+def delta_elimination(sum_indices, factors):
+    """IndexSum-Delta cancellation.
 
-    This routine was designed with finite element coefficient
-    evaluation in mind.
+    :arg sum_indices: free indices for contractions
+    :arg factors: product factors
+    :returns: optimised (sum_indices, factors)
     """
-    # Eliminate annoying ComponentTensors
-    expression, = remove_componenttensors([expression])
-
-    # Flatten a product tree
-    sum_indices = []
-    factors = []
-
-    queue = deque([expression])
-    while queue:
-        expr = queue.popleft()
-        if isinstance(expr, IndexSum):
-            queue.append(expr.children[0])
-            sum_indices.extend(expr.multiindex)
-        elif isinstance(expr, Product):
-            queue.extend(expr.children)
-        else:
-            factors.append(expr)
+    sum_indices = list(sum_indices)  # copy for modification
 
-    # Try to eliminate Delta nodes
     delta_queue = [(f, index)
                    for f in factors if isinstance(f, Delta)
                    for index in (f.i, f.j) if index in sum_indices]
@@ -241,7 +221,18 @@ def contraction(expression, logger=None):
                        for index in (f.i, f.j) if index in sum_indices]
 
     # Drop ones
-    factors = [e for e in factors if e != one]
+    return sum_indices, [e for e in factors if e != one]
+
+
+def sum_factorise(sum_indices, factors):
+    """Optimise a tensor product throw sum factorisation.
+
+    :arg sum_indices: free indices for contractions
+    :arg factors: product factors
+    :returns: optimised GEM expression
+    """
+    if len(sum_indices) > 5:
+        raise NotImplementedError("Too many indices for sum factorisation!")
 
     # Form groups by free indices
     groups = OrderedDict()
@@ -255,17 +246,31 @@ def contraction(expression, logger=None):
     expression = None
     best_flops = numpy.inf
 
+    # Consider all orderings of contraction indices
     for ordering in permutations(sum_indices):
         terms = groups[:]
         flops = 0
+        # Apply contraction index by index
         for sum_index in ordering:
+            # Select terms that need to be part of the contraction
             contract = [t for t in terms if sum_index in t.free_indices]
             deferred = [t for t in terms if sum_index not in t.free_indices]
 
+            # A further optimisation opportunity is to consider
+            # various ways of building the product tree.
             product = reduce(Product, contract)
             term = IndexSum(product, (sum_index,))
+            # For the operation count estimation we assume that no
+            # operations were saved with the particular product tree
+            # that we built above.
             flops += len(contract) * numpy.prod([i.extent for i in product.free_indices], dtype=int)
+
+            # Replace the contracted terms with the result of the
+            # contraction.
             terms = deferred + [term]
+
+        # If some contraction indices were independent, then we may
+        # still have several terms at this point.
         expr = reduce(Product, terms)
         flops += (len(terms) - 1) * numpy.prod([i.extent for i in expr.free_indices], dtype=int)
 
@@ -276,6 +281,37 @@ def contraction(expression, logger=None):
     return expression
 
 
+def contraction(expression):
+    """Optimise the contractions of the tensor product at the root of
+    the expression, including:
+
+    - IndexSum-Delta cancellation
+    - Sum factorisation
+
+    This routine was designed with finite element coefficient
+    evaluation in mind.
+    """
+    # Eliminate annoying ComponentTensors
+    expression, = remove_componenttensors([expression])
+
+    # Flatten a product tree
+    sum_indices = []
+    factors = []
+
+    queue = deque([expression])
+    while queue:
+        expr = queue.popleft()
+        if isinstance(expr, IndexSum):
+            queue.append(expr.children[0])
+            sum_indices.extend(expr.multiindex)
+        elif isinstance(expr, Product):
+            queue.extend(expr.children)
+        else:
+            factors.append(expr)
+
+    return sum_factorise(*delta_elimination(sum_indices, factors))
+
+
 @singledispatch
 def _replace_delta(node, self):
     raise AssertionError("cannot handle type %s" % type(node))
diff --git a/tsfc/fem.py b/tsfc/fem.py
index 545da67c..e7ab97f7 100644
--- a/tsfc/fem.py
+++ b/tsfc/fem.py
@@ -23,7 +23,6 @@
 from tsfc import ufl2gem, geometric
 from tsfc.finatinterface import create_element, as_fiat_cell
 from tsfc.kernel_interface import ProxyKernelInterface
-from tsfc.logging import logger
 from tsfc.modified_terminals import analyse_modified_terminal
 from tsfc.parameters import PARAMETERS
 from tsfc.ufl_utils import ModifiedTerminalMixin, PickRestriction, simplify_abs
@@ -246,7 +245,7 @@ def take_singleton(xs):
         value = gem.IndexSum(gem.Product(gem.Indexed(table, beta + zeta),
                                          gem.Indexed(vec, beta)),
                              beta)
-        optimised_value = gem.optimise.contraction(value, logger=logger)
+        optimised_value = gem.optimise.contraction(value)
         value_dict[alpha] = gem.ComponentTensor(optimised_value, zeta)
 
     # Change from FIAT to UFL arrangement

From 4eea68d05f64a9551962fb89776693cc09e76b95 Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 28 Nov 2016 12:34:12 +0000
Subject: [PATCH 17/18] use setdefault

---
 gem/optimise.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index 05c1baa4..c8e9def1 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -237,9 +237,7 @@ def sum_factorise(sum_indices, factors):
     # Form groups by free indices
     groups = OrderedDict()
     for factor in factors:
-        groups[factor.free_indices] = []
-    for factor in factors:
-        groups[factor.free_indices].append(factor)
+        groups.setdefault(factor.free_indices, []).append(factor)
     groups = [reduce(Product, terms) for terms in itervalues(groups)]
 
     # Sum factorisation

From e73fceb24cffedc8e1faa9c7de7f3095133d39cd Mon Sep 17 00:00:00 2001
From: Miklos Homolya <m.homolya14@imperial.ac.uk>
Date: Mon, 28 Nov 2016 13:42:59 +0000
Subject: [PATCH 18/18] remove redundant line

---
 gem/optimise.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gem/optimise.py b/gem/optimise.py
index c8e9def1..58078d2b 100644
--- a/gem/optimise.py
+++ b/gem/optimise.py
@@ -401,7 +401,6 @@ def aggressive_unroll(expression):
         tensor = numpy.empty(expression.shape, dtype=object)
         for alpha in numpy.ndindex(expression.shape):
             tensor[alpha] = Indexed(expression, alpha)
-        expression = ListTensor(tensor)
         expression, = remove_componenttensors((ListTensor(tensor),))
 
     # Unroll summation