Attempt to fix links vanishing after a mastership change

- LLDPLinkProvider: handle DeviceUpdate event
- DeviceManager: publish Device events caused by Mastership change
- DeviceManager: Always try to markOffline on deviceDisconnected
- GossipDeviceStore: Silently ignore failure to get Timestamp on port events

Change-Id: I51fbb3f1924007867512f20e62d6d53090c63640
This commit is contained in:
Yuta HIGUCHI 2014-10-26 19:34:20 -07:00
parent fd5cdf191f
commit eb24e9d0ac
5 changed files with 89 additions and 49 deletions

View File

@ -221,9 +221,15 @@ public class DeviceManager
log.info("Device {} connected", deviceId); log.info("Device {} connected", deviceId);
// check my Role // check my Role
MastershipRole role = mastershipService.requestRoleFor(deviceId); MastershipRole role = mastershipService.requestRoleFor(deviceId);
log.info("requestedRole, became {} for {}", role, deviceId);
if (role != MastershipRole.MASTER) { if (role != MastershipRole.MASTER) {
// TODO: Do we need to explicitly tell the Provider that // TODO: Do we need to explicitly tell the Provider that
// this instance is no longer the MASTER? probably not // this instance is no longer the MASTER? probably not
// Device device = getDevice(deviceId);
// if (device != null) {
// // FIXME roleChanged should take DeviceId instead of Device
// provider().roleChanged(device, role);
// }
return; return;
} }
MastershipTerm term = mastershipService.requestTermService() MastershipTerm term = mastershipService.requestTermService()
@ -231,6 +237,7 @@ public class DeviceManager
if (!term.master().equals(clusterService.getLocalNode().id())) { if (!term.master().equals(clusterService.getLocalNode().id())) {
// lost mastership after requestRole told this instance was MASTER. // lost mastership after requestRole told this instance was MASTER.
log.info("lost mastership before getting term info.");
return; return;
} }
@ -251,17 +258,13 @@ public class DeviceManager
// instance is the new Master, but // instance is the new Master, but
// event returned from the store is null? // event returned from the store is null?
// TODO: Confirm: Mastership could be lost after requestRole
// and createOrUpdateDevice call.
// In that case STANDBY node can
// claim itself to be master against the Device.
// Will the Node, chosen by the MastershipService, retry
// to get the MASTER role when that happen?
// FIXME: 1st argument should be deviceId, to allow setting // FIXME: 1st argument should be deviceId, to allow setting
// certain roles even if the store returned null. // certain roles even if the store returned null.
log.info("event: {} {}", event.type(), event);
provider().roleChanged(event.subject(), role); provider().roleChanged(event.subject(), role);
post(event); post(event);
} else {
log.info("No event to publish");
} }
} }
@ -270,32 +273,34 @@ public class DeviceManager
checkNotNull(deviceId, DEVICE_ID_NULL); checkNotNull(deviceId, DEVICE_ID_NULL);
checkValidity(); checkValidity();
// FIXME: only the MASTER should be marking off-line in normal cases,
// but if I was the last STANDBY connection, etc. and no one else
// was there to mark the device offline, this instance may need to
// temporarily request for Master Role and mark offline.
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
log.debug("Device {} disconnected, but I am not the master", deviceId);
//let go of ability to be backup
mastershipService.relinquishMastership(deviceId);
return;
}
DeviceEvent event = null; DeviceEvent event = null;
try { try {
event = store.markOffline(deviceId); event = store.markOffline(deviceId);
} catch (IllegalStateException e) { } catch (IllegalStateException e) {
log.warn("Failed to mark {} offline", deviceId);
// only the MASTER should be marking off-line in normal cases,
// but if I was the last STANDBY connection, etc. and no one else
// was there to mark the device offline, this instance may need to
// temporarily request for Master Role and mark offline.
//there are times when this node will correctly have mastership, BUT //there are times when this node will correctly have mastership, BUT
//that isn't reflected in the ClockManager before the device disconnects. //that isn't reflected in the ClockManager before the device disconnects.
//we want to let go of the device anyways, so make sure this happens. //we want to let go of the device anyways, so make sure this happens.
// FIXME: Come up with workaround for above scenario. // FIXME: Store semantics leaking out as IllegalStateException.
// Consider revising store API to handle this scenario.
MastershipRole role = mastershipService.requestRoleFor(deviceId);
MastershipTerm term = termService.getMastershipTerm(deviceId); MastershipTerm term = termService.getMastershipTerm(deviceId);
final NodeId myNodeId = clusterService.getLocalNode().id(); final NodeId myNodeId = clusterService.getLocalNode().id();
// TODO: Move this type of check inside device clock manager, etc. // TODO: Move this type of check inside device clock manager, etc.
if (myNodeId.equals(term.master())) { if (myNodeId.equals(term.master())) {
log.info("Marking {} offline", deviceId);
deviceClockProviderService.setMastershipTerm(deviceId, term); deviceClockProviderService.setMastershipTerm(deviceId, term);
event = store.markOffline(deviceId); event = store.markOffline(deviceId);
} else {
log.error("Failed again marking {} offline. {}", deviceId, role);
} }
} finally { } finally {
//relinquish master role and ability to be backup. //relinquish master role and ability to be backup.
@ -315,14 +320,6 @@ public class DeviceManager
checkNotNull(portDescriptions, checkNotNull(portDescriptions,
"Port descriptions list cannot be null"); "Port descriptions list cannot be null");
checkValidity(); checkValidity();
//XXX what's this doing here?
this.provider().id();
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
// TODO If we become master, then we'll trigger something to update this
// info to fix any inconsistencies that may result during the handoff.
return;
}
List<DeviceEvent> events = store.updatePorts(this.provider().id(), List<DeviceEvent> events = store.updatePorts(this.provider().id(),
deviceId, portDescriptions); deviceId, portDescriptions);
@ -338,12 +335,7 @@ public class DeviceManager
checkNotNull(portDescription, PORT_DESCRIPTION_NULL); checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
checkValidity(); checkValidity();
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { final DeviceEvent event = store.updatePortStatus(this.provider().id(),
// TODO If we become master, then we'll trigger something to update this
// info to fix any inconsistencies that may result during the handoff.
return;
}
DeviceEvent event = store.updatePortStatus(this.provider().id(),
deviceId, portDescription); deviceId, portDescription);
if (event != null) { if (event != null) {
log.info("Device {} port {} status changed", deviceId, event log.info("Device {} port {} status changed", deviceId, event
@ -407,15 +399,16 @@ public class DeviceManager
Device device = getDevice(did); Device device = getDevice(did);
if ((device != null) && !isAvailable(did)) { if ((device != null) && !isAvailable(did)) {
//flag the device as online. Is there a better way to do this? //flag the device as online. Is there a better way to do this?
store.createOrUpdateDevice(device.providerId(), did, DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did,
new DefaultDeviceDescription( new DefaultDeviceDescription(
did.uri(), device.type(), device.manufacturer(), did.uri(), device.type(), device.manufacturer(),
device.hwVersion(), device.swVersion(), device.hwVersion(), device.swVersion(),
device.serialNumber(), device.chassisId())); device.serialNumber(), device.chassisId()));
post(devEvent);
} }
//TODO re-collect device information to fix potential staleness
queryPortInfo(did);
applyRole(did, MastershipRole.MASTER); applyRole(did, MastershipRole.MASTER);
// re-collect device information to fix potential staleness
queryPortInfo(did);
} else if (event.roleInfo().backups().contains(myNodeId)) { } else if (event.roleInfo().backups().contains(myNodeId)) {
applyRole(did, MastershipRole.STANDBY); applyRole(did, MastershipRole.STANDBY);
} }

View File

@ -328,8 +328,8 @@ public class GossipDeviceStore
final Timestamp timestamp = deviceClockService.getTimestamp(deviceId); final Timestamp timestamp = deviceClockService.getTimestamp(deviceId);
final DeviceEvent event = markOfflineInternal(deviceId, timestamp); final DeviceEvent event = markOfflineInternal(deviceId, timestamp);
if (event != null) { if (event != null) {
log.info("Notifying peers of a device offline topology event for deviceId: {}", log.info("Notifying peers of a device offline topology event for deviceId: {} {}",
deviceId); deviceId, timestamp);
try { try {
notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp)); notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp));
} catch (IOException e) { } catch (IOException e) {
@ -399,7 +399,24 @@ public class GossipDeviceStore
DeviceId deviceId, DeviceId deviceId,
List<PortDescription> portDescriptions) { List<PortDescription> portDescriptions) {
final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); final Timestamp newTimestamp;
try {
newTimestamp = deviceClockService.getTimestamp(deviceId);
} catch (IllegalStateException e) {
log.info("Timestamp was not available for device {}", deviceId);
log.debug(" discarding {}", portDescriptions);
// Failed to generate timestamp.
// Possible situation:
// Device connected and became master for short period of time,
// but lost mastership before this instance had the chance to
// retrieve term information.
// Information dropped here is expected to be recoverable by
// device probing after mastership change
return Collections.emptyList();
}
log.info("timestamp for {} {}", deviceId, newTimestamp); log.info("timestamp for {} {}", deviceId, newTimestamp);
final Timestamped<List<PortDescription>> timestampedInput final Timestamped<List<PortDescription>> timestampedInput
@ -580,7 +597,16 @@ public class GossipDeviceStore
DeviceId deviceId, DeviceId deviceId,
PortDescription portDescription) { PortDescription portDescription) {
final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); final Timestamp newTimestamp;
try {
newTimestamp = deviceClockService.getTimestamp(deviceId);
} catch (IllegalStateException e) {
log.info("Timestamp was not available for device {}", deviceId);
log.debug(" discarding {}", portDescription);
// Failed to generate timestamp. Ignoring.
// See updatePorts comment
return null;
}
final Timestamped<PortDescription> deltaDesc final Timestamped<PortDescription> deltaDesc
= new Timestamped<>(portDescription, newTimestamp); = new Timestamped<>(portDescription, newTimestamp);
final DeviceEvent event; final DeviceEvent event;

View File

@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
log.error("Device is null."); log.error("Device is null.");
return; return;
} }
log.trace("{} {} {}", event.type(), event.subject(), event);
switch (event.type()) { switch (event.type()) {
case DEVICE_ADDED: case DEVICE_ADDED:
case DEVICE_UPDATED:
ld = discoverers.get(device.id());
if (ld == null) {
log.debug("Device added ({}) {}", event.type(), device.id());
discoverers.put(device.id(), discoverers.put(device.id(),
new LinkDiscovery(device, packetSevice, masterService, new LinkDiscovery(device, packetSevice, masterService,
providerService, useBDDP)); providerService, useBDDP));
} else {
if (ld.isStopped()) {
log.debug("Device restarted ({}) {}", event.type(), device.id());
ld.start();
}
}
break; break;
case PORT_ADDED: case PORT_ADDED:
case PORT_UPDATED: case PORT_UPDATED:
@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
return; return;
} }
if (!port.number().isLogical()) { if (!port.number().isLogical()) {
log.debug("Port added {}", port);
ld.addPort(port); ld.addPort(port);
} }
} else { } else {
@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
} }
break; break;
case PORT_REMOVED: case PORT_REMOVED:
log.debug("Port removed {}", port);
ConnectPoint point = new ConnectPoint(device.id(), ConnectPoint point = new ConnectPoint(device.id(),
port.number()); port.number());
providerService.linksVanished(point); providerService.linksVanished(point);
// TODO: Don't we need to removePort from ld?
break; break;
case DEVICE_REMOVED: case DEVICE_REMOVED:
case DEVICE_SUSPENDED: case DEVICE_SUSPENDED:
log.debug("Device removed {}", device.id());
ld = discoverers.get(device.id()); ld = discoverers.get(device.id());
if (ld == null) { if (ld == null) {
return; return;
@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
return; return;
} }
if (deviceService.isAvailable(device.id())) { if (deviceService.isAvailable(device.id())) {
log.debug("Device up {}", device.id());
ld.start(); ld.start();
} else { } else {
providerService.linksVanished(device.id()); providerService.linksVanished(device.id());
log.debug("Device down {}", device.id());
ld.stop(); ld.stop();
} }
break; break;
case DEVICE_UPDATED:
case DEVICE_MASTERSHIP_CHANGED: case DEVICE_MASTERSHIP_CHANGED:
if (!discoverers.containsKey(device.id())) { if (!discoverers.containsKey(device.id())) {
// TODO: ideally, should never reach here
log.debug("Device mastership changed ({}) {}", event.type(), device.id());
discoverers.put(device.id(), discoverers.put(device.id(),
new LinkDiscovery(device, packetSevice, masterService, new LinkDiscovery(device, packetSevice, masterService,
providerService, useBDDP)); providerService, useBDDP));

View File

@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask {
* @param port the port * @param port the port
*/ */
public void addPort(final Port port) { public void addPort(final Port port) {
this.log.debug("sending init probe to port {}", this.log.debug("sending init probe to port {}@{}",
port.number().toLong()); port.number().toLong(), device.id());
sendProbes(port.number().toLong()); sendProbes(port.number().toLong());
@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask {
*/ */
@Override @Override
public void run(final Timeout t) { public void run(final Timeout t) {
this.log.trace("sending probes"); this.log.trace("sending probes from {}", device.id());
synchronized (this) { synchronized (this) {
final Iterator<Long> fastIterator = this.fastPorts.iterator(); final Iterator<Long> fastIterator = this.fastPorts.iterator();
Long portNumber; Long portNumber;
@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask {
.getAndIncrement(); .getAndIncrement();
if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) { if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) {
this.log.trace("sending fast probe to port"); this.log.trace("sending fast probe to port {}", portNumber);
sendProbes(portNumber); sendProbes(portNumber);
} else { } else {
// Update fast and slow ports // Update fast and slow ports
@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask {
if (device.type() != Device.Type.ROADM && if (device.type() != Device.Type.ROADM &&
mastershipService.getLocalRole(this.device.id()) == mastershipService.getLocalRole(this.device.id()) ==
MastershipRole.MASTER) { MastershipRole.MASTER) {
log.debug("sending probes out to {}@{}", portNumber, device.id());
OutboundPacket pkt = this.createOutBoundLLDP(portNumber); OutboundPacket pkt = this.createOutBoundLLDP(portNumber);
pktService.emit(pkt); pktService.emit(pkt);
if (useBDDP) { if (useBDDP) {

View File

@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr
// FIXME if possible, we might want this to be part of // FIXME if possible, we might want this to be part of
// OpenFlowSwitch interface so the driver interface isn't misused. // OpenFlowSwitch interface so the driver interface isn't misused.
OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri())); OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri()));
if (!((OpenFlowSwitchDriver) sw).isConnected()) { if (sw == null ||
!((OpenFlowSwitchDriver) sw).isConnected()) {
LOG.error("Failed to probe device {} on sw={}", device, sw);
providerService.deviceDisconnected(device.id()); providerService.deviceDisconnected(device.id());
return; return;
} }