-
Notifications
You must be signed in to change notification settings - Fork 2.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
VXLAN datapath is supported by the kernel thereby reducing the overhead of TUN device and userspace switching. This patch takes advantange of DOVE extensions and does not use multicast for port flooding. Fixes #18
- Loading branch information
Eugene Yakubovich
committed
Nov 4, 2014
1 parent
e89a829
commit 80d0086
Showing
10 changed files
with
742 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,287 @@ | ||
package vxlan | ||
|
||
import ( | ||
"fmt" | ||
"net" | ||
"os" | ||
"syscall" | ||
"time" | ||
|
||
log "github.com/coreos/flannel/Godeps/_workspace/src/github.com/golang/glog" | ||
"github.com/coreos/flannel/Godeps/_workspace/src/github.com/vishvananda/netlink" | ||
"github.com/coreos/flannel/Godeps/_workspace/src/github.com/vishvananda/netlink/nl" | ||
|
||
"github.com/coreos/flannel/pkg/ip" | ||
) | ||
|
||
type vxlanDeviceAttrs struct { | ||
vni uint32 | ||
name string | ||
vtepIndex int | ||
vtepAddr net.IP | ||
vtepPort int | ||
} | ||
|
||
type vxlanDevice struct { | ||
link *netlink.Vxlan | ||
} | ||
|
||
func sysctlSet(path, value string) error { | ||
f, err := os.Create(path) | ||
if err != nil { | ||
return err | ||
} | ||
defer f.Close() | ||
|
||
_, err = f.Write([]byte(value)) | ||
return err | ||
} | ||
|
||
func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) { | ||
link := &netlink.Vxlan{ | ||
LinkAttrs: netlink.LinkAttrs{ | ||
Name: devAttrs.name, | ||
}, | ||
VxlanId: int(devAttrs.vni), | ||
VtepDevIndex: devAttrs.vtepIndex, | ||
SrcAddr: devAttrs.vtepAddr, | ||
Port: devAttrs.vtepPort, | ||
Learning: true, | ||
Proxy: true, | ||
L2miss: true, | ||
} | ||
|
||
link, err := ensureLink(link) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
// this enables ARP requests being sent to userspace via netlink | ||
sysctlPath := fmt.Sprintf("/proc/sys/net/ipv4/neigh/%s/app_solicit", devAttrs.name) | ||
sysctlSet(sysctlPath, "3") | ||
|
||
return &vxlanDevice{ | ||
link: link, | ||
}, nil | ||
} | ||
|
||
func ensureLink(vxlan *netlink.Vxlan) (*netlink.Vxlan, error) { | ||
err := netlink.LinkAdd(vxlan) | ||
if err != nil { | ||
if err == syscall.EEXIST { | ||
// it's ok if the device already exists as long as config is similar | ||
existing, err := netlink.LinkByName(vxlan.Name) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if incompat := vxlanLinksIncompat(vxlan, existing); incompat != "" { | ||
log.Warningf("%q already exists with incompatable configuration: %v; recreating device", vxlan.Name, incompat) | ||
|
||
// delete existing | ||
if err = netlink.LinkDel(existing); err != nil { | ||
return nil, fmt.Errorf("failed to delete interface: %v", err) | ||
} | ||
|
||
// create new | ||
if err = netlink.LinkAdd(vxlan); err != nil { | ||
return nil, fmt.Errorf("failed to create vxlan interface: %v", err) | ||
} | ||
} else { | ||
return existing.(*netlink.Vxlan), nil | ||
} | ||
} else { | ||
return nil, err | ||
} | ||
} | ||
|
||
ifindex := vxlan.Index | ||
link, err := netlink.LinkByIndex(vxlan.Index) | ||
if err != nil { | ||
return nil, fmt.Errorf("can't locate created vxlan device with index %v", ifindex) | ||
} | ||
var ok bool | ||
if vxlan, ok = link.(*netlink.Vxlan); !ok { | ||
return nil, fmt.Errorf("created vxlan device with index %v is not vxlan", ifindex) | ||
} | ||
|
||
return vxlan, nil | ||
} | ||
|
||
func (dev *vxlanDevice) Configure(ipn ip.IP4Net) error { | ||
setAddr4(dev.link, ipn.ToIPNet()) | ||
|
||
if err := netlink.LinkSetUp(dev.link); err != nil { | ||
return fmt.Errorf("failed to set interface %s to UP state: %s", dev.link.Attrs().Name, err) | ||
} | ||
|
||
// explicitly add a route since there might be a route for a subnet already | ||
// installed by Docker and then it won't get auto added | ||
route := netlink.Route{ | ||
LinkIndex: dev.link.Attrs().Index, | ||
Scope: netlink.SCOPE_UNIVERSE, | ||
Dst: ipn.Network().ToIPNet(), | ||
} | ||
if err := netlink.RouteAdd(&route); err != nil && err != syscall.EEXIST { | ||
return fmt.Errorf("failed to add route (%s -> %s): %v", ipn.Network().String(), dev.link.Attrs().Name, err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func (dev *vxlanDevice) Destroy() { | ||
netlink.LinkDel(dev.link) | ||
} | ||
|
||
func (dev *vxlanDevice) MACAddr() net.HardwareAddr { | ||
return dev.link.HardwareAddr | ||
} | ||
|
||
func (dev *vxlanDevice) MTU() int { | ||
return dev.link.MTU | ||
} | ||
|
||
func (dev *vxlanDevice) MonitorMisses(misses chan *netlink.Neigh) { | ||
nlsock, err := nl.Subscribe(syscall.NETLINK_ROUTE, syscall.RTNLGRP_NEIGH) | ||
if err != nil { | ||
log.Error("Failed to subscribe to netlink RTNLGRP_NEIGH messages") | ||
return | ||
} | ||
|
||
for { | ||
msgs, err := nlsock.Recieve() | ||
if err != nil { | ||
log.Errorf("Failed to receive from netlink: %v ", err) | ||
|
||
// wait 1 sec before retrying but honor the cancel channel | ||
time.Sleep(1*time.Second) | ||
continue | ||
} | ||
|
||
for _, msg := range msgs { | ||
dev.processNeighMsg(msg, misses) | ||
} | ||
} | ||
} | ||
|
||
func isNeighResolving(state int) bool { | ||
return (state & (netlink.NUD_INCOMPLETE | netlink.NUD_STALE | netlink.NUD_DELAY | netlink.NUD_PROBE)) != 0 | ||
} | ||
|
||
func (dev *vxlanDevice) processNeighMsg(msg syscall.NetlinkMessage, misses chan *netlink.Neigh) { | ||
neigh, err := netlink.NeighDeserialize(msg.Data) | ||
if err != nil { | ||
log.Error("Failed to deserialize netlink ndmsg: %v", err) | ||
return | ||
} | ||
|
||
if int(neigh.LinkIndex) != dev.link.Index { | ||
return | ||
} | ||
|
||
if msg.Header.Type != syscall.RTM_GETNEIGH && msg.Header.Type != syscall.RTM_NEWNEIGH { | ||
return | ||
} | ||
|
||
if !isNeighResolving(neigh.State) { | ||
// misses come with NUD_STALE bit set | ||
return | ||
} | ||
|
||
misses <- neigh | ||
} | ||
|
||
func (dev *vxlanDevice) AddL2(mac net.HardwareAddr, vtep net.IP) error { | ||
neigh := netlink.Neigh{ | ||
LinkIndex: dev.link.Index, | ||
State: netlink.NUD_REACHABLE, | ||
Family: syscall.AF_BRIDGE, | ||
Flags: netlink.NTF_SELF, | ||
IP: vtep, | ||
HardwareAddr: mac, | ||
} | ||
|
||
log.Infof("calling NeighAdd: %v, %v", vtep, mac) | ||
return netlink.NeighAdd(&neigh) | ||
} | ||
|
||
func (dev *vxlanDevice) DelL2(mac net.HardwareAddr, vtep net.IP) error { | ||
neigh := netlink.Neigh{ | ||
LinkIndex: dev.link.Index, | ||
Family: syscall.AF_BRIDGE, | ||
Flags: netlink.NTF_SELF, | ||
IP: vtep, | ||
HardwareAddr: mac, | ||
} | ||
|
||
log.Infof("calling NeighDel: %v, %v", vtep, mac) | ||
return netlink.NeighDel(&neigh) | ||
} | ||
|
||
func (dev *vxlanDevice) AddL3(ip net.IP, mac net.HardwareAddr) error { | ||
neigh := netlink.Neigh{ | ||
LinkIndex: dev.link.Index, | ||
State: netlink.NUD_REACHABLE, | ||
Type: syscall.RTN_UNICAST, | ||
IP: ip, | ||
HardwareAddr: mac, | ||
} | ||
|
||
log.Infof("calling NeighSet: %v, %v", ip, mac) | ||
return netlink.NeighSet(&neigh) | ||
} | ||
|
||
func vxlanLinksIncompat(l1, l2 netlink.Link) string { | ||
if l1.Type() != l2.Type() { | ||
return fmt.Sprintf("link type: %v vs %v", l1.Type(), l2.Type()) | ||
} | ||
|
||
v1 := l1.(*netlink.Vxlan) | ||
v2 := l2.(*netlink.Vxlan) | ||
|
||
if v1.VxlanId != v2.VxlanId { | ||
return fmt.Sprintf("vni: %v vs %v", v1.VxlanId, v2.VxlanId) | ||
} | ||
|
||
if v1.VtepDevIndex > 0 && v2.VtepDevIndex > 0 && v1.VtepDevIndex != v2.VtepDevIndex { | ||
return fmt.Sprintf("vtep (external) interface: %v vs %v", v1.VtepDevIndex, v2.VtepDevIndex) | ||
} | ||
|
||
if len(v1.SrcAddr) > 0 && len(v2.SrcAddr) > 0 && !v1.SrcAddr.Equal(v2.SrcAddr) { | ||
return fmt.Sprintf("vtep (external) IP: %v vs %v", v1.SrcAddr, v2.SrcAddr) | ||
} | ||
|
||
if len(v1.Group) > 0 && len(v2.Group) > 0 && !v1.Group.Equal(v2.Group) { | ||
return fmt.Sprintf("group address: %v vs %v", v1.Group, v2.Group) | ||
} | ||
|
||
if v1.L2miss != v2.L2miss { | ||
return fmt.Sprintf("l2miss: %v vs %v", v1.L2miss, v2.L2miss) | ||
} | ||
|
||
if v1.Port > 0 && v2.Port > 0 && v1.Port != v2.Port { | ||
return fmt.Sprintf("port: %v vs %v", v1.Port, v2.Port) | ||
} | ||
|
||
return "" | ||
} | ||
|
||
// sets IP4 addr on link removing any existing ones first | ||
func setAddr4(link *netlink.Vxlan, ipn *net.IPNet) error { | ||
addrs, err := netlink.AddrList(link, syscall.AF_INET) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, addr := range addrs { | ||
if err = netlink.AddrDel(link, &addr); err != nil { | ||
return fmt.Errorf("failed to delete IPv4 addr %s from %s", addr.String(), link.Attrs().Name) | ||
} | ||
} | ||
|
||
addr := netlink.Addr{ ipn, "" } | ||
if err = netlink.AddrAdd(link, &addr); err != nil { | ||
return fmt.Errorf("failed to add IP address %s to %s: %s", ipn.String(), link.Attrs().Name, err) | ||
} | ||
|
||
return nil | ||
} |
Oops, something went wrong.