Skip to content

Commit

Permalink
feat: Conditionally remove networkd online dependency on Ubuntu (#5772)
Browse files Browse the repository at this point in the history
Traditionally, cloud-init-network.service (previously
cloud-init.service) waited for network connectivity (via systemd
service ordering) before running. This has caused
cloud-init-network.service to block boot for a significant amount of
time. For the vast majority of boots, this network connectivity
isn't required.

This commit removes the ordering
After=systemd-networkd-wait-online.service, but checks the datasource
and user data in the init-local timeframe to see if network
connectivity will be necessary in the init network timeframe.
If so, when the init network service starts, it will start
systemd-networkd-wait-online.service manually.

This commit affects only Ubuntu because of the number of different
service orderings and network renderers possible, along with the
downstream synchronization needed. However, a new overridable method in
the Distro class should make this optimization trivial to implement for
any other distro.
  • Loading branch information
TheRealFalcon authored Oct 17, 2024
1 parent 879945f commit e30549e
Show file tree
Hide file tree
Showing 12 changed files with 436 additions and 63 deletions.
115 changes: 114 additions & 1 deletion cloudinit/cmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import traceback
import logging
import yaml
from typing import Tuple, Callable
from typing import Optional, Tuple, Callable, Union

from cloudinit import netinfo
from cloudinit import signal_handler
Expand All @@ -34,11 +34,13 @@
from cloudinit import reporting
from cloudinit import atomic_helper
from cloudinit import lifecycle
from cloudinit import handlers
from cloudinit.log import log_util, loggers
from cloudinit.cmd.devel import read_cfg_paths
from cloudinit.config import cc_set_hostname
from cloudinit.config.modules import Modules
from cloudinit.config.schema import validate_cloudconfig_schema
from cloudinit.lifecycle import log_with_downgradable_level
from cloudinit.reporting import events
from cloudinit.settings import (
PER_INSTANCE,
Expand All @@ -47,6 +49,8 @@
CLOUD_CONFIG,
)

Reason = str

# Welcome message template
WELCOME_MSG_TPL = (
"Cloud-init v. {version} running '{action}' at "
Expand Down Expand Up @@ -319,6 +323,96 @@ def _should_bring_up_interfaces(init, args):
return not args.local


def _should_wait_via_user_data(
    raw_config: Optional[Union[str, bytes]]
) -> Tuple[bool, Reason]:
    """Determine if our cloud-config requires us to wait.

    User data requires us to wait during cloud-init network phase if:
    - We have user data that is anything other than cloud-config
      - This can likely be further optimized in the future to include
        other user data types
    - cloud-config contains:
      - bootcmd
      - random_seed command
      - mounts
      - write_files with a source uri that is not a local path or
        ``file:`` URI

    :param raw_config: Raw user data or vendor data, or None/empty if
        nothing was provided.
    :returns: Tuple of (should_wait, reason), where reason is a
        human-readable explanation of the decision.
    """
    if not raw_config:
        return False, "no configuration found"

    if (
        handlers.type_from_starts_with(raw_config.strip()[:13])
        != "text/cloud-config"
    ):
        return True, "non-cloud-config user data found"

    try:
        parsed_yaml = yaml.safe_load(raw_config)
    except Exception as e:
        log_with_downgradable_level(
            logger=LOG,
            version="24.4",
            requested_level=logging.WARNING,
            msg="Unexpected failure parsing userdata: %s",
            args=e,
        )
        return True, "failed to parse user data as yaml"

    # A bare "#cloud-config" header with no body parses to None: there is
    # nothing in it that could require the network.
    if parsed_yaml is None:
        return (
            False,
            "cloud-config does not contain network requiring elements",
        )
    # Anything else that is not a mapping is not valid cloud-config. Be
    # conservative and wait, as is done for unparseable user data.
    if not isinstance(parsed_yaml, dict):
        return True, "cloud-config user data is not a mapping"

    # These all have the potential to require network access, so we should wait
    write_files = parsed_yaml.get("write_files")
    if isinstance(write_files, list):
        for item in write_files:
            source = item.get("source") if isinstance(item, dict) else None
            uri = source.get("uri") if isinstance(source, dict) else None
            if (
                isinstance(uri, str)
                and uri
                and not uri.startswith(("/", "file:"))
            ):
                return True, "write_files with source uri found"
        # Do NOT return here: write_files without a remote source must not
        # mask bootcmd/random_seed/mounts entries in the same config.
    if parsed_yaml.get("bootcmd"):
        return True, "bootcmd found"
    random_seed = parsed_yaml.get("random_seed")
    if isinstance(random_seed, dict) and random_seed.get("command"):
        return True, "random_seed command found"
    if parsed_yaml.get("mounts"):
        return True, "mounts found"
    return False, "cloud-config does not contain network requiring elements"


def _should_wait_on_network(
    datasource: Optional[sources.DataSource],
) -> Tuple[bool, Reason]:
    """Decide whether cloud-init's network stage needs connectivity.

    Waiting is required when there is no datasource at all, or when the
    user data, vendor data or vendor data2 may need network access. The
    three sources are inspected in that order and the first one that
    requires the network ends the check.

    :param datasource: The discovered datasource, or None if none was found.
    :returns: Tuple of (should_wait, reason), where reason is a
        human-readable explanation of the decision.
    """
    if not datasource:
        return True, "no datasource found"

    # (label used in the reason text, name of the datasource getter)
    checks = (
        ("user data", "get_userdata_raw"),
        ("vendor data", "get_vendordata_raw"),
        ("vendor data2", "get_vendordata2_raw"),
    )
    summaries = []
    for label, getter_name in checks:
        # Look the getter up lazily so later sources are only touched when
        # the earlier ones did not already require the network.
        raw = getattr(datasource, getter_name)()
        must_wait, why = _should_wait_via_user_data(raw)
        if must_wait:
            return True, f"{why} in {label}"
        summaries.append(f"{label}: {why}")
    return False, ", ".join(summaries)


def main_init(name, args):
deps = [sources.DEP_FILESYSTEM, sources.DEP_NETWORK]
if args.local:
Expand Down Expand Up @@ -396,6 +490,9 @@ def main_init(name, args):
mode = sources.DSMODE_LOCAL if args.local else sources.DSMODE_NETWORK

if mode == sources.DSMODE_NETWORK:
if not os.path.exists(init.paths.get_runpath(".skip-network")):
LOG.debug("Will wait for network connectivity before continuing")
init.distro.wait_for_network()
existing = "trust"
sys.stderr.write("%s\n" % (netinfo.debug_info()))
else:
Expand Down Expand Up @@ -463,9 +560,25 @@ def main_init(name, args):
# dhcp clients to advertize this hostname to any DDNS services
# LP: #1746455.
_maybe_set_hostname(init, stage="local", retry_stage="network")

init.apply_network_config(bring_up=bring_up_interfaces)

if mode == sources.DSMODE_LOCAL:
should_wait, reason = _should_wait_on_network(init.datasource)
if should_wait:
LOG.debug(
"Network connectivity determined necessary for "
"cloud-init's network stage. Reason: %s",
reason,
)
else:
LOG.debug(
"Network connectivity determined unnecessary for "
"cloud-init's network stage. Reason: %s",
reason,
)
util.write_file(init.paths.get_runpath(".skip-network"), "")

if init.datasource.dsmode != mode:
LOG.debug(
"[%s] Exiting. datasource %s not in local mode.",
Expand Down
31 changes: 23 additions & 8 deletions cloudinit/distros/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,15 +349,16 @@ def dhcp_client(self) -> dhcp.DhcpClient:
raise dhcp.NoDHCPLeaseMissingDhclientError()

@property
def network_activator(self) -> Optional[Type[activators.NetworkActivator]]:
"""Return the configured network activator for this environment."""
def network_activator(self) -> Type[activators.NetworkActivator]:
"""Return the configured network activator for this environment.
:returns: The network activator class to use
:raises: NoActivatorException if no activator is found
"""
priority = util.get_cfg_by_path(
self._cfg, ("network", "activators"), None
)
try:
return activators.select_activator(priority=priority)
except activators.NoActivatorException:
return None
return activators.select_activator(priority=priority)

@property
def network_renderer(self) -> Renderer:
Expand Down Expand Up @@ -460,8 +461,9 @@ def apply_network_config(self, netconfig, bring_up=False) -> bool:
# Now try to bring them up
if bring_up:
LOG.debug("Bringing up newly configured network interfaces")
network_activator = self.network_activator
if not network_activator:
try:
network_activator = self.network_activator
except activators.NoActivatorException:
LOG.warning(
"No network activator found, not bringing up "
"network interfaces"
Expand Down Expand Up @@ -1574,6 +1576,19 @@ def device_part_info(devpath: str) -> tuple:
# name in /dev/
return diskdevpath, ptnum

def wait_for_network(self) -> None:
    """Hook: make sure cloud-init has network connectivity.

    The default implementation does nothing. On most distros cloud-init's
    network service is already ordered after network connectivity has
    been achieved, so there is nothing left to wait for.

    A distro may instead order cloud-init's network service directly
    after cloud-init's local service and require connectivity only when
    it has been deemed necessary. Such a distro overrides this method to
    perform the wait; it is called during cloud-init's network stage once
    connectivity was determined to be needed there.
    """


def _apply_hostname_transformations_to_url(url: str, transformations: list):
"""
Expand Down
13 changes: 13 additions & 0 deletions cloudinit/distros/ubuntu.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,15 @@
# This file is part of cloud-init. See LICENSE file for license information.

import copy
import logging

from cloudinit.distros import PREFERRED_NTP_CLIENTS, debian
from cloudinit.distros.package_management.snap import Snap
from cloudinit.net import activators
from cloudinit.net.netplan import CLOUDINIT_NETPLAN_FILE

LOG = logging.getLogger(__name__)


class Distro(debian.Distro):
def __init__(self, name, cfg, paths):
Expand Down Expand Up @@ -49,3 +53,12 @@ def preferred_ntp_clients(self):
if not self._preferred_ntp_clients:
self._preferred_ntp_clients = copy.deepcopy(PREFERRED_NTP_CLIENTS)
return self._preferred_ntp_clients

def wait_for_network(self) -> None:
    """Best-effort wait for network connectivity via the network activator.

    Any failure, including the absence of a usable network activator, is
    logged as an error rather than raised.
    """
    try:
        activator = self.network_activator
        activator.wait_for_network()
    except activators.NoActivatorException:
        LOG.error("Failed to wait for network. No network activator found")
    except Exception as err:
        LOG.error("Failed to wait for network: %s", err)
1 change: 1 addition & 0 deletions cloudinit/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ def __init__(self, path_cfgs: dict, ds=None):
"vendor_scripts": "scripts/vendor",
"warnings": "warnings",
"hotplug.enabled": "hotplug.enabled",
".skip-network": ".skip-network",
}
# Set when a datasource becomes active
self.datasource = ds
Expand Down
65 changes: 42 additions & 23 deletions cloudinit/net/activators.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,9 @@
from typing import Callable, Dict, Iterable, List, Optional, Type, Union

from cloudinit import subp, util
from cloudinit.net.eni import available as eni_available
from cloudinit.net import eni, netplan, network_manager, networkd
from cloudinit.net.netops.iproute2 import Iproute2
from cloudinit.net.netplan import available as netplan_available
from cloudinit.net.network_manager import available as nm_available
from cloudinit.net.network_state import NetworkState
from cloudinit.net.networkd import available as networkd_available

LOG = logging.getLogger(__name__)

Expand Down Expand Up @@ -88,6 +85,11 @@ def bring_up_all_interfaces(cls, network_state: NetworkState) -> bool:
[i["name"] for i in network_state.iter_interfaces()]
)

@staticmethod
def wait_for_network() -> None:
    """Block until the network is up.

    The base activator cannot do this itself; activators that support
    waiting override this method.
    """
    raise NotImplementedError


class IfUpDownActivator(NetworkActivator):
# Note that we're not overriding bring_up_interfaces to pass something
Expand All @@ -97,7 +99,7 @@ class IfUpDownActivator(NetworkActivator):
@staticmethod
def available(target: Optional[str] = None) -> bool:
"""Return true if ifupdown can be used on this system."""
return eni_available(target=target)
return eni.available(target=target)

@staticmethod
def bring_up_interface(device_name: str) -> bool:
Expand Down Expand Up @@ -149,7 +151,7 @@ class NetworkManagerActivator(NetworkActivator):
@staticmethod
def available(target=None) -> bool:
"""Return true if NetworkManager can be used on this system."""
return nm_available(target=target)
return network_manager.available(target=target)

@staticmethod
def bring_up_interface(device_name: str) -> bool:
Expand Down Expand Up @@ -215,7 +217,7 @@ class NetplanActivator(NetworkActivator):
@staticmethod
def available(target=None) -> bool:
"""Return true if netplan can be used on this system."""
return netplan_available(target=target)
return netplan.available(target=target)

@staticmethod
def bring_up_interface(device_name: str) -> bool:
Expand Down Expand Up @@ -269,12 +271,21 @@ def bring_down_interface(device_name: str) -> bool:
NetplanActivator.NETPLAN_CMD, "all", warn_on_stderr=False
)

@staticmethod
def wait_for_network() -> None:
    """Wait for systemd-networkd-wait-online unless NetworkManager is used.

    Waiting is currently only supported with the networkd renderer, so
    when NetworkManager is available the wait is skipped.
    """
    if not network_manager.available():
        NetworkdActivator.wait_for_network()
        return
    LOG.debug("NetworkManager is enabled, skipping networkd wait")


class NetworkdActivator(NetworkActivator):
@staticmethod
def available(target=None) -> bool:
"""Return true if networkd can be used on this system."""
return networkd_available(target=target)
return networkd.available(target=target)

@staticmethod
def bring_up_interface(device_name: str) -> bool:
Expand All @@ -296,6 +307,13 @@ def bring_down_interface(device_name: str) -> bool:
partial(Iproute2.link_down, device_name)
)

@staticmethod
def wait_for_network() -> None:
    """Start systemd-networkd-wait-online.service through systemctl."""
    wait_online_cmd = [
        "systemctl",
        "start",
        "systemd-networkd-wait-online.service",
    ]
    subp.subp(wait_online_cmd)


# This section is mostly copied and pasted from renderers.py. An abstract
# version to encompass both seems overkill at this point
Expand All @@ -318,35 +336,36 @@ def bring_down_interface(device_name: str) -> bool:

def search_activator(
priority: List[str], target: Union[str, None]
) -> List[Type[NetworkActivator]]:
) -> Optional[Type[NetworkActivator]]:
"""Returns the first available activator from the priority list or None."""
unknown = [i for i in priority if i not in DEFAULT_PRIORITY]
if unknown:
raise ValueError(
"Unknown activators provided in priority list: %s" % unknown
f"Unknown activators provided in priority list: {unknown}"
)
activator_classes = [NAME_TO_ACTIVATOR[name] for name in priority]
return [
activator_cls
for activator_cls in activator_classes
if activator_cls.available(target)
]
return next(
(
activator_cls
for activator_cls in activator_classes
if activator_cls.available(target)
),
None,
)


def select_activator(
priority: Optional[List[str]] = None, target: Optional[str] = None
) -> Type[NetworkActivator]:
if priority is None:
priority = DEFAULT_PRIORITY
found = search_activator(priority, target)
if not found:
tmsg = ""
if target and target != "/":
tmsg = " in target=%s" % target
selected = search_activator(priority, target)
if not selected:
tmsg = f" in target={target}" if target and target != "/" else ""
raise NoActivatorException(
"No available network activators found%s. Searched "
"through list: %s" % (tmsg, priority)
f"No available network activators found{tmsg}. "
f"Searched through list: {priority}"
)
selected = found[0]
LOG.debug(
"Using selected activator: %s from priority: %s", selected, priority
)
Expand Down
Loading

0 comments on commit e30549e

Please sign in to comment.