mirror of
https://github.com/opennetworkinglab/onos.git
synced 2025-11-07 19:52:03 +01:00
Attempt to fix Link vanishing after mastership change
- LLDPLinkProvider: handle DeviceUpdate event - DeviceManager: publish Device events caused by Mastership change - DeviceManager: Always try to markOffLine on deviceDisconnected - GossipDeviceStore: Silently ignore failure to get Timestamp on port events Change-Id: I51fbb3f1924007867512f20e62d6d53090c63640
This commit is contained in:
parent
fd5cdf191f
commit
eb24e9d0ac
@ -221,9 +221,15 @@ public class DeviceManager
|
|||||||
log.info("Device {} connected", deviceId);
|
log.info("Device {} connected", deviceId);
|
||||||
// check my Role
|
// check my Role
|
||||||
MastershipRole role = mastershipService.requestRoleFor(deviceId);
|
MastershipRole role = mastershipService.requestRoleFor(deviceId);
|
||||||
|
log.info("requestedRole, became {} for {}", role, deviceId);
|
||||||
if (role != MastershipRole.MASTER) {
|
if (role != MastershipRole.MASTER) {
|
||||||
// TODO: Do we need to explicitly tell the Provider that
|
// TODO: Do we need to explicitly tell the Provider that
|
||||||
// this instance is no longer the MASTER? probably not
|
// this instance is no longer the MASTER? probably not
|
||||||
|
// Device device = getDevice(deviceId);
|
||||||
|
// if (device != null) {
|
||||||
|
// // FIXME roleChanged should take DeviceId instead of Device
|
||||||
|
// provider().roleChanged(device, role);
|
||||||
|
// }
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
MastershipTerm term = mastershipService.requestTermService()
|
MastershipTerm term = mastershipService.requestTermService()
|
||||||
@ -231,6 +237,7 @@ public class DeviceManager
|
|||||||
|
|
||||||
if (!term.master().equals(clusterService.getLocalNode().id())) {
|
if (!term.master().equals(clusterService.getLocalNode().id())) {
|
||||||
// lost mastership after requestRole told this instance was MASTER.
|
// lost mastership after requestRole told this instance was MASTER.
|
||||||
|
log.info("lost mastership before getting term info.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -251,17 +258,13 @@ public class DeviceManager
|
|||||||
// instance is the new Master, but
|
// instance is the new Master, but
|
||||||
// event returned from the store is null?
|
// event returned from the store is null?
|
||||||
|
|
||||||
// TODO: Confirm: Mastership could be lost after requestRole
|
|
||||||
// and createOrUpdateDevice call.
|
|
||||||
// In that case STANDBY node can
|
|
||||||
// claim itself to be master against the Device.
|
|
||||||
// Will the Node, chosen by the MastershipService, retry
|
|
||||||
// to get the MASTER role when that happen?
|
|
||||||
|
|
||||||
// FIXME: 1st argument should be deviceId, to allow setting
|
// FIXME: 1st argument should be deviceId, to allow setting
|
||||||
// certain roles even if the store returned null.
|
// certain roles even if the store returned null.
|
||||||
|
log.info("event: {} {}", event.type(), event);
|
||||||
provider().roleChanged(event.subject(), role);
|
provider().roleChanged(event.subject(), role);
|
||||||
post(event);
|
post(event);
|
||||||
|
} else {
|
||||||
|
log.info("No event to publish");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -270,32 +273,34 @@ public class DeviceManager
|
|||||||
checkNotNull(deviceId, DEVICE_ID_NULL);
|
checkNotNull(deviceId, DEVICE_ID_NULL);
|
||||||
checkValidity();
|
checkValidity();
|
||||||
|
|
||||||
// FIXME: only the MASTER should be marking off-line in normal cases,
|
|
||||||
// but if I was the last STANDBY connection, etc. and no one else
|
|
||||||
// was there to mark the device offline, this instance may need to
|
|
||||||
// temporarily request for Master Role and mark offline.
|
|
||||||
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
|
|
||||||
log.debug("Device {} disconnected, but I am not the master", deviceId);
|
|
||||||
//let go of ability to be backup
|
|
||||||
mastershipService.relinquishMastership(deviceId);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
DeviceEvent event = null;
|
DeviceEvent event = null;
|
||||||
try {
|
try {
|
||||||
event = store.markOffline(deviceId);
|
event = store.markOffline(deviceId);
|
||||||
} catch (IllegalStateException e) {
|
} catch (IllegalStateException e) {
|
||||||
|
log.warn("Failed to mark {} offline", deviceId);
|
||||||
|
// only the MASTER should be marking off-line in normal cases,
|
||||||
|
// but if I was the last STANDBY connection, etc. and no one else
|
||||||
|
// was there to mark the device offline, this instance may need to
|
||||||
|
// temporarily request for Master Role and mark offline.
|
||||||
|
|
||||||
//there are times when this node will correctly have mastership, BUT
|
//there are times when this node will correctly have mastership, BUT
|
||||||
//that isn't reflected in the ClockManager before the device disconnects.
|
//that isn't reflected in the ClockManager before the device disconnects.
|
||||||
//we want to let go of the device anyways, so make sure this happens.
|
//we want to let go of the device anyways, so make sure this happens.
|
||||||
|
|
||||||
// FIXME: Come up with workaround for above scenario.
|
// FIXME: Store semantics leaking out as IllegalStateException.
|
||||||
|
// Consider revising store API to handle this scenario.
|
||||||
|
|
||||||
|
MastershipRole role = mastershipService.requestRoleFor(deviceId);
|
||||||
MastershipTerm term = termService.getMastershipTerm(deviceId);
|
MastershipTerm term = termService.getMastershipTerm(deviceId);
|
||||||
final NodeId myNodeId = clusterService.getLocalNode().id();
|
final NodeId myNodeId = clusterService.getLocalNode().id();
|
||||||
// TODO: Move this type of check inside device clock manager, etc.
|
// TODO: Move this type of check inside device clock manager, etc.
|
||||||
if (myNodeId.equals(term.master())) {
|
if (myNodeId.equals(term.master())) {
|
||||||
|
log.info("Marking {} offline", deviceId);
|
||||||
deviceClockProviderService.setMastershipTerm(deviceId, term);
|
deviceClockProviderService.setMastershipTerm(deviceId, term);
|
||||||
event = store.markOffline(deviceId);
|
event = store.markOffline(deviceId);
|
||||||
|
} else {
|
||||||
|
log.error("Failed again marking {} offline. {}", deviceId, role);
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
//relinquish master role and ability to be backup.
|
//relinquish master role and ability to be backup.
|
||||||
@ -315,14 +320,6 @@ public class DeviceManager
|
|||||||
checkNotNull(portDescriptions,
|
checkNotNull(portDescriptions,
|
||||||
"Port descriptions list cannot be null");
|
"Port descriptions list cannot be null");
|
||||||
checkValidity();
|
checkValidity();
|
||||||
//XXX what's this doing here?
|
|
||||||
this.provider().id();
|
|
||||||
|
|
||||||
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
|
|
||||||
// TODO If we become master, then we'll trigger something to update this
|
|
||||||
// info to fix any inconsistencies that may result during the handoff.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
List<DeviceEvent> events = store.updatePorts(this.provider().id(),
|
List<DeviceEvent> events = store.updatePorts(this.provider().id(),
|
||||||
deviceId, portDescriptions);
|
deviceId, portDescriptions);
|
||||||
@ -338,12 +335,7 @@ public class DeviceManager
|
|||||||
checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
|
checkNotNull(portDescription, PORT_DESCRIPTION_NULL);
|
||||||
checkValidity();
|
checkValidity();
|
||||||
|
|
||||||
if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) {
|
final DeviceEvent event = store.updatePortStatus(this.provider().id(),
|
||||||
// TODO If we become master, then we'll trigger something to update this
|
|
||||||
// info to fix any inconsistencies that may result during the handoff.
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
DeviceEvent event = store.updatePortStatus(this.provider().id(),
|
|
||||||
deviceId, portDescription);
|
deviceId, portDescription);
|
||||||
if (event != null) {
|
if (event != null) {
|
||||||
log.info("Device {} port {} status changed", deviceId, event
|
log.info("Device {} port {} status changed", deviceId, event
|
||||||
@ -407,15 +399,16 @@ public class DeviceManager
|
|||||||
Device device = getDevice(did);
|
Device device = getDevice(did);
|
||||||
if ((device != null) && !isAvailable(did)) {
|
if ((device != null) && !isAvailable(did)) {
|
||||||
//flag the device as online. Is there a better way to do this?
|
//flag the device as online. Is there a better way to do this?
|
||||||
store.createOrUpdateDevice(device.providerId(), did,
|
DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did,
|
||||||
new DefaultDeviceDescription(
|
new DefaultDeviceDescription(
|
||||||
did.uri(), device.type(), device.manufacturer(),
|
did.uri(), device.type(), device.manufacturer(),
|
||||||
device.hwVersion(), device.swVersion(),
|
device.hwVersion(), device.swVersion(),
|
||||||
device.serialNumber(), device.chassisId()));
|
device.serialNumber(), device.chassisId()));
|
||||||
|
post(devEvent);
|
||||||
}
|
}
|
||||||
//TODO re-collect device information to fix potential staleness
|
|
||||||
queryPortInfo(did);
|
|
||||||
applyRole(did, MastershipRole.MASTER);
|
applyRole(did, MastershipRole.MASTER);
|
||||||
|
// re-collect device information to fix potential staleness
|
||||||
|
queryPortInfo(did);
|
||||||
} else if (event.roleInfo().backups().contains(myNodeId)) {
|
} else if (event.roleInfo().backups().contains(myNodeId)) {
|
||||||
applyRole(did, MastershipRole.STANDBY);
|
applyRole(did, MastershipRole.STANDBY);
|
||||||
}
|
}
|
||||||
|
|||||||
@ -328,8 +328,8 @@ public class GossipDeviceStore
|
|||||||
final Timestamp timestamp = deviceClockService.getTimestamp(deviceId);
|
final Timestamp timestamp = deviceClockService.getTimestamp(deviceId);
|
||||||
final DeviceEvent event = markOfflineInternal(deviceId, timestamp);
|
final DeviceEvent event = markOfflineInternal(deviceId, timestamp);
|
||||||
if (event != null) {
|
if (event != null) {
|
||||||
log.info("Notifying peers of a device offline topology event for deviceId: {}",
|
log.info("Notifying peers of a device offline topology event for deviceId: {} {}",
|
||||||
deviceId);
|
deviceId, timestamp);
|
||||||
try {
|
try {
|
||||||
notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp));
|
notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp));
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
@ -399,7 +399,24 @@ public class GossipDeviceStore
|
|||||||
DeviceId deviceId,
|
DeviceId deviceId,
|
||||||
List<PortDescription> portDescriptions) {
|
List<PortDescription> portDescriptions) {
|
||||||
|
|
||||||
final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
|
final Timestamp newTimestamp;
|
||||||
|
try {
|
||||||
|
newTimestamp = deviceClockService.getTimestamp(deviceId);
|
||||||
|
} catch (IllegalStateException e) {
|
||||||
|
log.info("Timestamp was not available for device {}", deviceId);
|
||||||
|
log.debug(" discarding {}", portDescriptions);
|
||||||
|
// Failed to generate timestamp.
|
||||||
|
|
||||||
|
// Possible situation:
|
||||||
|
// Device connected and became master for short period of time,
|
||||||
|
// but lost mastership before this instance had the chance to
|
||||||
|
// retrieve term information.
|
||||||
|
|
||||||
|
// Information dropped here is expected to be recoverable by
|
||||||
|
// device probing after mastership change
|
||||||
|
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
log.info("timestamp for {} {}", deviceId, newTimestamp);
|
log.info("timestamp for {} {}", deviceId, newTimestamp);
|
||||||
|
|
||||||
final Timestamped<List<PortDescription>> timestampedInput
|
final Timestamped<List<PortDescription>> timestampedInput
|
||||||
@ -580,7 +597,16 @@ public class GossipDeviceStore
|
|||||||
DeviceId deviceId,
|
DeviceId deviceId,
|
||||||
PortDescription portDescription) {
|
PortDescription portDescription) {
|
||||||
|
|
||||||
final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId);
|
final Timestamp newTimestamp;
|
||||||
|
try {
|
||||||
|
newTimestamp = deviceClockService.getTimestamp(deviceId);
|
||||||
|
} catch (IllegalStateException e) {
|
||||||
|
log.info("Timestamp was not available for device {}", deviceId);
|
||||||
|
log.debug(" discarding {}", portDescription);
|
||||||
|
// Failed to generate timestamp. Ignoring.
|
||||||
|
// See updatePorts comment
|
||||||
|
return null;
|
||||||
|
}
|
||||||
final Timestamped<PortDescription> deltaDesc
|
final Timestamped<PortDescription> deltaDesc
|
||||||
= new Timestamped<>(portDescription, newTimestamp);
|
= new Timestamped<>(portDescription, newTimestamp);
|
||||||
final DeviceEvent event;
|
final DeviceEvent event;
|
||||||
|
|||||||
@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
|
|||||||
log.error("Device is null.");
|
log.error("Device is null.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
log.trace("{} {} {}", event.type(), event.subject(), event);
|
||||||
switch (event.type()) {
|
switch (event.type()) {
|
||||||
case DEVICE_ADDED:
|
case DEVICE_ADDED:
|
||||||
|
case DEVICE_UPDATED:
|
||||||
|
ld = discoverers.get(device.id());
|
||||||
|
if (ld == null) {
|
||||||
|
log.debug("Device added ({}) {}", event.type(), device.id());
|
||||||
discoverers.put(device.id(),
|
discoverers.put(device.id(),
|
||||||
new LinkDiscovery(device, packetSevice, masterService,
|
new LinkDiscovery(device, packetSevice, masterService,
|
||||||
providerService, useBDDP));
|
providerService, useBDDP));
|
||||||
|
} else {
|
||||||
|
if (ld.isStopped()) {
|
||||||
|
log.debug("Device restarted ({}) {}", event.type(), device.id());
|
||||||
|
ld.start();
|
||||||
|
}
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
case PORT_ADDED:
|
case PORT_ADDED:
|
||||||
case PORT_UPDATED:
|
case PORT_UPDATED:
|
||||||
@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!port.number().isLogical()) {
|
if (!port.number().isLogical()) {
|
||||||
|
log.debug("Port added {}", port);
|
||||||
ld.addPort(port);
|
ld.addPort(port);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case PORT_REMOVED:
|
case PORT_REMOVED:
|
||||||
|
log.debug("Port removed {}", port);
|
||||||
ConnectPoint point = new ConnectPoint(device.id(),
|
ConnectPoint point = new ConnectPoint(device.id(),
|
||||||
port.number());
|
port.number());
|
||||||
providerService.linksVanished(point);
|
providerService.linksVanished(point);
|
||||||
|
// TODO: Don't we need to removePort from ld?
|
||||||
break;
|
break;
|
||||||
case DEVICE_REMOVED:
|
case DEVICE_REMOVED:
|
||||||
case DEVICE_SUSPENDED:
|
case DEVICE_SUSPENDED:
|
||||||
|
log.debug("Device removed {}", device.id());
|
||||||
ld = discoverers.get(device.id());
|
ld = discoverers.get(device.id());
|
||||||
if (ld == null) {
|
if (ld == null) {
|
||||||
return;
|
return;
|
||||||
@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (deviceService.isAvailable(device.id())) {
|
if (deviceService.isAvailable(device.id())) {
|
||||||
|
log.debug("Device up {}", device.id());
|
||||||
ld.start();
|
ld.start();
|
||||||
} else {
|
} else {
|
||||||
providerService.linksVanished(device.id());
|
providerService.linksVanished(device.id());
|
||||||
|
log.debug("Device down {}", device.id());
|
||||||
ld.stop();
|
ld.stop();
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case DEVICE_UPDATED:
|
|
||||||
case DEVICE_MASTERSHIP_CHANGED:
|
case DEVICE_MASTERSHIP_CHANGED:
|
||||||
if (!discoverers.containsKey(device.id())) {
|
if (!discoverers.containsKey(device.id())) {
|
||||||
|
// TODO: ideally, should never reach here
|
||||||
|
log.debug("Device mastership changed ({}) {}", event.type(), device.id());
|
||||||
discoverers.put(device.id(),
|
discoverers.put(device.id(),
|
||||||
new LinkDiscovery(device, packetSevice, masterService,
|
new LinkDiscovery(device, packetSevice, masterService,
|
||||||
providerService, useBDDP));
|
providerService, useBDDP));
|
||||||
|
|||||||
@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask {
|
|||||||
* @param port the port
|
* @param port the port
|
||||||
*/
|
*/
|
||||||
public void addPort(final Port port) {
|
public void addPort(final Port port) {
|
||||||
this.log.debug("sending init probe to port {}",
|
this.log.debug("sending init probe to port {}@{}",
|
||||||
port.number().toLong());
|
port.number().toLong(), device.id());
|
||||||
|
|
||||||
sendProbes(port.number().toLong());
|
sendProbes(port.number().toLong());
|
||||||
|
|
||||||
@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask {
|
|||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public void run(final Timeout t) {
|
public void run(final Timeout t) {
|
||||||
this.log.trace("sending probes");
|
this.log.trace("sending probes from {}", device.id());
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
final Iterator<Long> fastIterator = this.fastPorts.iterator();
|
final Iterator<Long> fastIterator = this.fastPorts.iterator();
|
||||||
Long portNumber;
|
Long portNumber;
|
||||||
@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask {
|
|||||||
.getAndIncrement();
|
.getAndIncrement();
|
||||||
|
|
||||||
if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) {
|
if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) {
|
||||||
this.log.trace("sending fast probe to port");
|
this.log.trace("sending fast probe to port {}", portNumber);
|
||||||
sendProbes(portNumber);
|
sendProbes(portNumber);
|
||||||
} else {
|
} else {
|
||||||
// Update fast and slow ports
|
// Update fast and slow ports
|
||||||
@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask {
|
|||||||
if (device.type() != Device.Type.ROADM &&
|
if (device.type() != Device.Type.ROADM &&
|
||||||
mastershipService.getLocalRole(this.device.id()) ==
|
mastershipService.getLocalRole(this.device.id()) ==
|
||||||
MastershipRole.MASTER) {
|
MastershipRole.MASTER) {
|
||||||
|
log.debug("sending probes out to {}@{}", portNumber, device.id());
|
||||||
OutboundPacket pkt = this.createOutBoundLLDP(portNumber);
|
OutboundPacket pkt = this.createOutBoundLLDP(portNumber);
|
||||||
pktService.emit(pkt);
|
pktService.emit(pkt);
|
||||||
if (useBDDP) {
|
if (useBDDP) {
|
||||||
|
|||||||
@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr
|
|||||||
// FIXME if possible, we might want this to be part of
|
// FIXME if possible, we might want this to be part of
|
||||||
// OpenFlowSwitch interface so the driver interface isn't misused.
|
// OpenFlowSwitch interface so the driver interface isn't misused.
|
||||||
OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri()));
|
OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri()));
|
||||||
if (!((OpenFlowSwitchDriver) sw).isConnected()) {
|
if (sw == null ||
|
||||||
|
!((OpenFlowSwitchDriver) sw).isConnected()) {
|
||||||
|
LOG.error("Failed to probe device {} on sw={}", device, sw);
|
||||||
providerService.deviceDisconnected(device.id());
|
providerService.deviceDisconnected(device.id());
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user