Files
start-os/core/src/util/logger.rs
Aiden McClelland 2bb1463f4f fix: mitigate tokio I/O driver starvation (tokio-rs/tokio#4730)
Tokio's multi-thread scheduler has an unfixed vulnerability where all
worker threads can end up parked on condvars with no worker driving the
I/O reactor.  Condvar-parked workers have no timeout and sleep
indefinitely, so once in this state the runtime never recovers.

This was observed on a box migrating from 0.3.5.1: after heavy task
churn (package reinstalls, container operations, logging) all 16 workers
ended up on futex_wait with no thread on epoll_wait.  The web server
listened on both HTTP and HTTPS but never replied.  The box was stuck
for 7+ hours with 0% CPU.

Two mitigations:

1. Watchdog OS thread (startd.rs): a plain std::thread that every 30s
   injects a no-op task via Handle::spawn.  This forces a condvar-parked
   worker to wake, cycle through park, and grab the driver TryLock —
   breaking the stall regardless of what triggered it.

2. block_in_place in the logger (logger.rs): the TeeWriter holds a
   std::sync::Mutex across blocking file + stderr writes on worker
   threads.  Wrapping in block_in_place tells tokio to hand off driver
   duties before the worker blocks, reducing the window for starvation.
   Guarded by runtime_flavor() to avoid panicking on current-thread
   runtimes used by the CLI.
2026-03-25 10:14:03 -06:00

126 lines
3.9 KiB
Rust

use std::fs::File;
use std::io::{self, Write};
use std::sync::{Arc, Mutex, MutexGuard};
use lazy_static::lazy_static;
use tracing::Subscriber;
use tracing_subscriber::fmt::MakeWriter;
use tracing_subscriber::util::SubscriberInitExt;
lazy_static! {
pub static ref LOGGER: StartOSLogger = StartOSLogger::init();
}
#[derive(Clone)]
pub struct StartOSLogger {
logfile: LogFile,
}
#[derive(Clone, Default)]
struct LogFile(Arc<Mutex<Option<File>>>);
impl<'a> MakeWriter<'a> for LogFile {
type Writer = Box<dyn Write + 'a>;
fn make_writer(&'a self) -> Self::Writer {
let f = self.0.lock().unwrap();
if f.is_some() {
struct TeeWriter<'a>(MutexGuard<'a, Option<File>>);
impl<'a> Write for TeeWriter<'a> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
// Blocking file+stderr I/O on a tokio worker thread can
// starve the I/O driver (tokio-rs/tokio#4730).
// block_in_place tells the runtime to hand off driver
// duties before we block. Only available on the
// multi-thread runtime; falls back to a direct write on
// current-thread runtimes (CLI) or outside a runtime.
if matches!(
tokio::runtime::Handle::try_current().map(|h| h.runtime_flavor()),
Ok(tokio::runtime::RuntimeFlavor::MultiThread),
) {
tokio::task::block_in_place(|| self.write_inner(buf))
} else {
self.write_inner(buf)
}
}
fn flush(&mut self) -> io::Result<()> {
if let Some(f) = &mut *self.0 {
f.flush()?;
}
Ok(())
}
}
impl<'a> TeeWriter<'a> {
fn write_inner(&mut self, buf: &[u8]) -> io::Result<usize> {
let n = if let Some(f) = &mut *self.0 {
f.write(buf)?
} else {
buf.len()
};
io::stderr().write_all(&buf[..n])?;
Ok(n)
}
}
Box::new(TeeWriter(f))
} else {
drop(f);
Box::new(io::stderr())
}
}
}
impl StartOSLogger {
pub fn enable(&self) {}
pub fn set_logfile(&self, logfile: Option<File>) {
*self.logfile.0.lock().unwrap() = logfile;
}
fn base_subscriber(logfile: LogFile) -> impl Subscriber {
use tracing_error::ErrorLayer;
use tracing_subscriber::prelude::*;
use tracing_subscriber::{EnvFilter, fmt};
let filter_layer = || {
EnvFilter::builder()
.with_default_directive(
format!("{}=info", std::module_path!().split("::").next().unwrap())
.parse()
.unwrap(),
)
.from_env_lossy()
};
let fmt_layer = fmt::layer()
.with_writer(logfile)
.with_line_number(true)
.with_file(true)
.with_target(true)
.with_filter(filter_layer());
let sub = tracing_subscriber::registry();
#[cfg(feature = "console-subscriber")]
let sub = sub.with(console_subscriber::spawn());
let sub = sub.with(fmt_layer).with(ErrorLayer::default());
sub
}
fn init() -> Self {
let logfile = LogFile::default();
Self::base_subscriber(logfile.clone()).init();
color_eyre::install().unwrap_or_else(|_| tracing::warn!("tracing too many times"));
StartOSLogger { logfile }
}
}
#[tokio::test]
pub async fn order_level() {
assert!(tracing::Level::WARN > tracing::Level::ERROR)
}
#[test]
pub fn module() {
println!("{}", module_path!())
}