【paddle.fleet】use the Floyd algorithm to find max path of meta optimizer #27867

Merged · 1 commit · Oct 13, 2020 · changes from all commits
103 changes: 87 additions & 16 deletions python/paddle/distributed/fleet/base/strategy_compiler.py
@@ -13,24 +13,95 @@
# limitations under the License.


Removed (the old greedy scan, which picked the single longest candidate chain):

def maximum_path_len_algo(optimizer_list):
    max_idx = 0
    max_len = 0
    candidates = []
    for idx, opt in enumerate(optimizer_list):
        local_buffer = [opt]
        for opt_inner in optimizer_list:
            if opt._can_update(opt_inner):
                local_buffer.append(opt_inner)
        if len(local_buffer) > max_len:
            max_idx = idx
            max_len = len(local_buffer)
        candidates.append(local_buffer)
    if len(candidates) == 0:
        return None
    for idx, opt in enumerate(candidates[max_idx][:-1]):
        opt._update_inner_optimizer(candidates[max_idx][idx + 1])
    return candidates[max_idx]

Added (build an explicit graph from the _can_update relation):

def create_graph(optimizer_list):
    nsize = len(optimizer_list)

    edge = [[0] * nsize for _ in range(nsize)]  # adjacency matrix
    indegree = [0] * nsize
    for i, opt in enumerate(optimizer_list):
        for j, opt_inner in enumerate(optimizer_list):
            if opt._can_update(opt_inner):
                edge[i][j] = 1  # edge weight
                indegree[j] += 1

    return edge, indegree
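
For intuition, here is what create_graph would produce for a hypothetical three-optimizer list in which A can update B and C, and B can update C. The _Toy class and all names below are illustrative stand-ins, not part of the PR:

# Illustrative stub exposing only the _can_update hook that create_graph uses.
class _Toy:
    def __init__(self, name, targets):
        self.name = name
        self._targets = targets

    def _can_update(self, other):
        return other.name in self._targets

A = _Toy('A', {'B', 'C'})
B = _Toy('B', {'C'})
C = _Toy('C', set())

edge, indegree = create_graph([A, B, C])
# edge     == [[0, 1, 1],
#              [0, 0, 1],
#              [0, 0, 0]]
# indegree == [0, 1, 2]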


def topo_sort(edge, indegree):
    nsize = len(indegree)

    topo = [-1] * nsize
    for i in range(nsize):
        # find a node whose indegree is 0; if none exists, the graph has a cycle
        j = 0
        while j < nsize and indegree[j] != 0:
            j += 1
        assert j < nsize, 'The combination of meta optimizers contains a cycle'

        # remove node j from the graph
        topo[i] = j
        indegree[j] = -1
        for k in range(nsize):
            if edge[j][k] != 0:
                indegree[k] -= 1

    return topo
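
The assert is the cycle check: if two meta optimizers can each update the other, no node ever reaches indegree 0. A minimal sketch with hand-built inputs, assuming topo_sort as defined above:

edge = [[0, 1],
        [1, 0]]
indegree = [1, 1]

try:
    topo_sort(edge, indegree)
except AssertionError as err:
    print(err)  # The combination of meta optimizers contains a cycle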


def floyd(edge):
    nsize = len(edge)
    max_len = -1
    max_edge = [-1, -1]

    max_path = [[[] for _ in range(nsize)] for _ in range(nsize)]
    for i in range(nsize):
        for j in range(nsize):
            if edge[i][j] > 0:
                max_path[i][j] = [j]

                if edge[i][j] > max_len:
                    max_len = edge[i][j]
                    max_edge = [i, j]

    # use the Floyd algorithm to find max_path
    for k in range(nsize):
        for i in range(nsize):
            for j in range(nsize):
                # if a-->b-->c but a-/->c, we can only apply a-->b or b-->c;
                # if a-->b-->c and also a-->c, we can apply a-->b-->c
                if edge[i][j] == 0:
                    continue

                if edge[i][k] == 0 or edge[k][j] == 0:
                    continue

                if edge[i][j] < edge[i][k] + edge[k][j]:
                    edge[i][j] = edge[i][k] + edge[k][j]
                    max_path[i][j] = max_path[i][k] + max_path[k][j]

                    # record the newly relaxed path as the current max
                    max_len = edge[i][j]
                    max_edge = [i, j]

    if max_len == -1:
        return [0]

    return [max_edge[0]] + max_path[max_edge[0]][max_edge[1]]
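
To make the relaxation concrete, here is a hand-traced sketch on a hypothetical three-node graph, assuming floyd as defined above (note that floyd mutates its edge argument):

# Chain 0-->1-->2 plus the direct edge 0-->2: the relaxation at k = 1
# lifts edge[0][2] from 1 to 2, so the full chain is returned.
edge = [[0, 1, 1],
        [0, 0, 1],
        [0, 0, 0]]
print(floyd(edge))  # [0, 1, 2]

# Same chain without the direct edge 0-->2: relaxation is skipped because
# edge[0][2] == 0, so only a single edge can be applied.
edge = [[0, 1, 0],
        [0, 0, 1],
        [0, 0, 0]]
print(floyd(edge))  # [0, 1]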


def maximum_path_len_algo(optimizer_list):
    if len(optimizer_list) == 0:
        return None

    edge, indegree = create_graph(optimizer_list)
    topo_sort(edge, indegree)
    max_path = floyd(edge)

    candidate = []
    for idx in max_path:
        candidate.append(optimizer_list[idx])

    for idx, opt in enumerate(candidate[:-1]):
        opt._update_inner_optimizer(candidate[idx + 1])

    return candidate
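
End to end, a minimal sketch of how the new path selection behaves, assuming the four functions above are in scope. Stub and all names below are illustrative; the real meta optimizers implement _can_update and _update_inner_optimizer themselves:

class Stub:
    def __init__(self, name, targets):
        self.name = name
        self._targets = targets  # names of optimizers this one may wrap
        self.inner = None

    def _can_update(self, other):
        return other.name in self._targets

    def _update_inner_optimizer(self, other):
        self.inner = other  # chain: self wraps `other`

amp = Stub('amp', {'recompute', 'lars'})
recompute = Stub('recompute', {'lars'})
lars = Stub('lars', set())
dgc = Stub('dgc', set())  # nothing can follow dgc in this toy setup

chain = maximum_path_len_algo([amp, recompute, lars, dgc])
print([opt.name for opt in chain])  # ['amp', 'recompute', 'lars']; dgc is not applied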


class StrategyCompilerBase(object):
@@ -103,6 +103,51 @@ def test_amp_recompute_optimizer(self):
        # recompute
        self.assertIn('subprog', ''.join(outs))

    def test_amp_recompute_lars_optimizer(self):
        """ test amp + recompute + lars """
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'recompute')
        self.set_strategy(strategy, 'lars')
        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

        strategy = fleet._final_strategy()

        ops = [op.type for op in avg_cost.block.ops]
        outs = [
            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
        ]
        self.assertIn('cast', ops)
        self.assertIn('check_finite_and_unscale', ops)

        # recompute
        self.assertIn('subprog', ''.join(outs))

        # lars
        self.assertIn('lars_momentum', ops)

    def test_amp_recompute_lamb_optimizer(self):
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'recompute')
        self.set_strategy(strategy, 'lamb')
        self.optimizer(avg_cost, strategy, train_prog, startup_prog, 'adam')

        ops = [op.type for op in avg_cost.block.ops]
        outs = [
            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
        ]
        self.assertIn('cast', ops)
        self.assertIn('check_finite_and_unscale', ops)

        # recompute
        self.assertIn('subprog', ''.join(outs))

        # lamb
        self.assertIn('lamb', ops)


if __name__ == "__main__":
    unittest.main()
@@ -128,6 +128,36 @@ def test_dgc_recompute_optimizer(self):
        # recompute
        self.assertIn('subprog', ''.join(outs))

    def test_amp_recompute_lars_dgc_not_apply_optimizer(self):
        """ test amp + recompute + lars + dgc:
            amp -/-> dgc, so the max path is amp-->recompute-->lars
        """
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy = self.net(train_prog, startup_prog)
        self.set_strategy(strategy, 'dgc')
        self.set_strategy(strategy, 'amp')
        self.set_strategy(strategy, 'recompute')
        self.set_strategy(strategy, 'lars')
        self.optimizer(avg_cost, strategy, train_prog, startup_prog)

        strategy = fleet._final_strategy()

        ops = [op.type for op in avg_cost.block.ops]
        outs = [
            op.output('Out')[0] for op in avg_cost.block.ops if op.type == 'mul'
        ]
        self.assertIn('cast', ops)
        self.assertIn('check_finite_and_unscale', ops)

        # recompute
        self.assertIn('subprog', ''.join(outs))

        # lars
        self.assertIn('lars_momentum', ops)

        # dgc is not applied
        self.assertFalse(strategy.dgc)


if __name__ == "__main__":
    unittest.main()