diff --git a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java index 3863449c37..76d46b0236 100644 --- a/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java +++ b/core/net/src/main/java/org/onlab/onos/net/device/impl/DeviceManager.java @@ -221,9 +221,15 @@ public class DeviceManager log.info("Device {} connected", deviceId); // check my Role MastershipRole role = mastershipService.requestRoleFor(deviceId); + log.info("requestedRole, became {} for {}", role, deviceId); if (role != MastershipRole.MASTER) { // TODO: Do we need to explicitly tell the Provider that // this instance is no longer the MASTER? probably not +// Device device = getDevice(deviceId); +// if (device != null) { +// // FIXME roleChanged should take DeviceId instead of Device +// provider().roleChanged(device, role); +// } return; } MastershipTerm term = mastershipService.requestTermService() @@ -231,6 +237,7 @@ public class DeviceManager if (!term.master().equals(clusterService.getLocalNode().id())) { // lost mastership after requestRole told this instance was MASTER. + log.info("lost mastership before getting term info."); return; } @@ -251,17 +258,13 @@ public class DeviceManager // instance is the new Master, but // event returned from the store is null? - // TODO: Confirm: Mastership could be lost after requestRole - // and createOrUpdateDevice call. - // In that case STANDBY node can - // claim itself to be master against the Device. - // Will the Node, chosen by the MastershipService, retry - // to get the MASTER role when that happen? - // FIXME: 1st argument should be deviceId, to allow setting // certain roles even if the store returned null. + log.info("event: {} {}", event.type(), event); provider().roleChanged(event.subject(), role); post(event); + } else { + log.info("No event to publish"); } } @@ -270,32 +273,34 @@ public class DeviceManager checkNotNull(deviceId, DEVICE_ID_NULL); checkValidity(); - // FIXME: only the MASTER should be marking off-line in normal cases, - // but if I was the last STANDBY connection, etc. and no one else - // was there to mark the device offline, this instance may need to - // temporarily request for Master Role and mark offline. - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { - log.debug("Device {} disconnected, but I am not the master", deviceId); - //let go of ability to be backup - mastershipService.relinquishMastership(deviceId); - return; - } DeviceEvent event = null; try { event = store.markOffline(deviceId); } catch (IllegalStateException e) { + log.warn("Failed to mark {} offline", deviceId); + // only the MASTER should be marking off-line in normal cases, + // but if I was the last STANDBY connection, etc. and no one else + // was there to mark the device offline, this instance may need to + // temporarily request for Master Role and mark offline. + //there are times when this node will correctly have mastership, BUT //that isn't reflected in the ClockManager before the device disconnects. //we want to let go of the device anyways, so make sure this happens. - // FIXME: Come up with workaround for above scenario. + // FIXME: Store semantics leaking out as IllegalStateException. + // Consider revising store API to handle this scenario. + + MastershipRole role = mastershipService.requestRoleFor(deviceId); MastershipTerm term = termService.getMastershipTerm(deviceId); final NodeId myNodeId = clusterService.getLocalNode().id(); // TODO: Move this type of check inside device clock manager, etc. if (myNodeId.equals(term.master())) { + log.info("Marking {} offline", deviceId); deviceClockProviderService.setMastershipTerm(deviceId, term); event = store.markOffline(deviceId); + } else { + log.error("Failed again marking {} offline. {}", deviceId, role); } } finally { //relinquish master role and ability to be backup. @@ -315,14 +320,6 @@ public class DeviceManager checkNotNull(portDescriptions, "Port descriptions list cannot be null"); checkValidity(); - //XXX what's this doing here? - this.provider().id(); - - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { - // TODO If we become master, then we'll trigger something to update this - // info to fix any inconsistencies that may result during the handoff. - return; - } List events = store.updatePorts(this.provider().id(), deviceId, portDescriptions); @@ -338,13 +335,8 @@ public class DeviceManager checkNotNull(portDescription, PORT_DESCRIPTION_NULL); checkValidity(); - if (!mastershipService.getLocalRole(deviceId).equals(MastershipRole.MASTER)) { - // TODO If we become master, then we'll trigger something to update this - // info to fix any inconsistencies that may result during the handoff. - return; - } - DeviceEvent event = store.updatePortStatus(this.provider().id(), - deviceId, portDescription); + final DeviceEvent event = store.updatePortStatus(this.provider().id(), + deviceId, portDescription); if (event != null) { log.info("Device {} port {} status changed", deviceId, event .port().number()); @@ -407,15 +399,16 @@ public class DeviceManager Device device = getDevice(did); if ((device != null) && !isAvailable(did)) { //flag the device as online. Is there a better way to do this? - store.createOrUpdateDevice(device.providerId(), did, + DeviceEvent devEvent = store.createOrUpdateDevice(device.providerId(), did, new DefaultDeviceDescription( did.uri(), device.type(), device.manufacturer(), device.hwVersion(), device.swVersion(), device.serialNumber(), device.chassisId())); + post(devEvent); } - //TODO re-collect device information to fix potential staleness - queryPortInfo(did); applyRole(did, MastershipRole.MASTER); + // re-collect device information to fix potential staleness + queryPortInfo(did); } else if (event.roleInfo().backups().contains(myNodeId)) { applyRole(did, MastershipRole.STANDBY); } diff --git a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java index 21941b5002..6fbc034d2c 100644 --- a/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java +++ b/core/store/dist/src/main/java/org/onlab/onos/store/device/impl/GossipDeviceStore.java @@ -328,8 +328,8 @@ public class GossipDeviceStore final Timestamp timestamp = deviceClockService.getTimestamp(deviceId); final DeviceEvent event = markOfflineInternal(deviceId, timestamp); if (event != null) { - log.info("Notifying peers of a device offline topology event for deviceId: {}", - deviceId); + log.info("Notifying peers of a device offline topology event for deviceId: {} {}", + deviceId, timestamp); try { notifyPeers(new InternalDeviceOfflineEvent(deviceId, timestamp)); } catch (IOException e) { @@ -399,7 +399,24 @@ public class GossipDeviceStore DeviceId deviceId, List portDescriptions) { - final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); + final Timestamp newTimestamp; + try { + newTimestamp = deviceClockService.getTimestamp(deviceId); + } catch (IllegalStateException e) { + log.info("Timestamp was not available for device {}", deviceId); + log.debug(" discarding {}", portDescriptions); + // Failed to generate timestamp. + + // Possible situation: + // Device connected and became master for short period of time, + // but lost mastership before this instance had the chance to + // retrieve term information. + + // Information dropped here is expected to be recoverable by + // device probing after mastership change + + return Collections.emptyList(); + } log.info("timestamp for {} {}", deviceId, newTimestamp); final Timestamped> timestampedInput @@ -580,7 +597,16 @@ public class GossipDeviceStore DeviceId deviceId, PortDescription portDescription) { - final Timestamp newTimestamp = deviceClockService.getTimestamp(deviceId); + final Timestamp newTimestamp; + try { + newTimestamp = deviceClockService.getTimestamp(deviceId); + } catch (IllegalStateException e) { + log.info("Timestamp was not available for device {}", deviceId); + log.debug(" discarding {}", portDescription); + // Failed to generate timestamp. Ignoring. + // See updatePorts comment + return null; + } final Timestamped deltaDesc = new Timestamped<>(portDescription, newTimestamp); final DeviceEvent event; diff --git a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java index f024414451..8e50378733 100644 --- a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java +++ b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LLDPLinkProvider.java @@ -111,11 +111,22 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { log.error("Device is null."); return; } + log.trace("{} {} {}", event.type(), event.subject(), event); switch (event.type()) { case DEVICE_ADDED: - discoverers.put(device.id(), - new LinkDiscovery(device, packetSevice, masterService, - providerService, useBDDP)); + case DEVICE_UPDATED: + ld = discoverers.get(device.id()); + if (ld == null) { + log.debug("Device added ({}) {}", event.type(), device.id()); + discoverers.put(device.id(), + new LinkDiscovery(device, packetSevice, masterService, + providerService, useBDDP)); + } else { + if (ld.isStopped()) { + log.debug("Device restarted ({}) {}", event.type(), device.id()); + ld.start(); + } + } break; case PORT_ADDED: case PORT_UPDATED: @@ -125,6 +136,7 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { return; } if (!port.number().isLogical()) { + log.debug("Port added {}", port); ld.addPort(port); } } else { @@ -134,12 +146,15 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { } break; case PORT_REMOVED: + log.debug("Port removed {}", port); ConnectPoint point = new ConnectPoint(device.id(), port.number()); providerService.linksVanished(point); + // TODO: Don't we need to removePort from ld? break; case DEVICE_REMOVED: case DEVICE_SUSPENDED: + log.debug("Device removed {}", device.id()); ld = discoverers.get(device.id()); if (ld == null) { return; @@ -153,15 +168,18 @@ public class LLDPLinkProvider extends AbstractProvider implements LinkProvider { return; } if (deviceService.isAvailable(device.id())) { + log.debug("Device up {}", device.id()); ld.start(); } else { providerService.linksVanished(device.id()); + log.debug("Device down {}", device.id()); ld.stop(); } break; - case DEVICE_UPDATED: case DEVICE_MASTERSHIP_CHANGED: if (!discoverers.containsKey(device.id())) { + // TODO: ideally, should never reach here + log.debug("Device mastership changed ({}) {}", event.type(), device.id()); discoverers.put(device.id(), new LinkDiscovery(device, packetSevice, masterService, providerService, useBDDP)); diff --git a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java index e60ed90718..9bcd8c62f7 100644 --- a/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java +++ b/providers/lldp/src/main/java/org/onlab/onos/provider/lldp/impl/LinkDiscovery.java @@ -139,8 +139,8 @@ public class LinkDiscovery implements TimerTask { * @param port the port */ public void addPort(final Port port) { - this.log.debug("sending init probe to port {}", - port.number().toLong()); + this.log.debug("sending init probe to port {}@{}", + port.number().toLong(), device.id()); sendProbes(port.number().toLong()); @@ -245,7 +245,7 @@ public class LinkDiscovery implements TimerTask { */ @Override public void run(final Timeout t) { - this.log.trace("sending probes"); + this.log.trace("sending probes from {}", device.id()); synchronized (this) { final Iterator fastIterator = this.fastPorts.iterator(); Long portNumber; @@ -256,7 +256,7 @@ public class LinkDiscovery implements TimerTask { .getAndIncrement(); if (probeCount < LinkDiscovery.MAX_PROBE_COUNT) { - this.log.trace("sending fast probe to port"); + this.log.trace("sending fast probe to port {}", portNumber); sendProbes(portNumber); } else { // Update fast and slow ports @@ -356,6 +356,7 @@ public class LinkDiscovery implements TimerTask { if (device.type() != Device.Type.ROADM && mastershipService.getLocalRole(this.device.id()) == MastershipRole.MASTER) { + log.debug("sending probes out to {}@{}", portNumber, device.id()); OutboundPacket pkt = this.createOutBoundLLDP(portNumber); pktService.emit(pkt); if (useBDDP) { diff --git a/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java b/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java index 984e8ae3a2..a7ee9e78f6 100644 --- a/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java +++ b/providers/openflow/device/src/main/java/org/onlab/onos/provider/of/device/impl/OpenFlowDeviceProvider.java @@ -96,7 +96,9 @@ public class OpenFlowDeviceProvider extends AbstractProvider implements DevicePr // FIXME if possible, we might want this to be part of // OpenFlowSwitch interface so the driver interface isn't misused. OpenFlowSwitch sw = controller.getSwitch(dpid(device.id().uri())); - if (!((OpenFlowSwitchDriver) sw).isConnected()) { + if (sw == null || + !((OpenFlowSwitchDriver) sw).isConnected()) { + LOG.error("Failed to probe device {} on sw={}", device, sw); providerService.deviceDisconnected(device.id()); return; }