mirror of
https://github.com/Start9Labs/start-os.git
synced 2026-03-26 10:21:52 +00:00
Tokio's multi-thread scheduler has an unfixed vulnerability where all worker threads can end up parked on condvars with no worker driving the I/O reactor. Condvar-parked workers have no timeout and sleep indefinitely, so once in this state the runtime never recovers. This was observed on a box migrating from 0.3.5.1: after heavy task churn (package reinstalls, container operations, logging) all 16 workers ended up on futex_wait with no thread on epoll_wait. The web server listened on both HTTP and HTTPS but never replied. The box was stuck for 7+ hours with 0% CPU. Two mitigations: 1. Watchdog OS thread (startd.rs): a plain std::thread that every 30s injects a no-op task via Handle::spawn. This forces a condvar-parked worker to wake, cycle through park, and grab the driver TryLock — breaking the stall regardless of what triggered it. 2. block_in_place in the logger (logger.rs): the TeeWriter holds a std::sync::Mutex across blocking file + stderr writes on worker threads. Wrapping in block_in_place tells tokio to hand off driver duties before the worker blocks, reducing the window for starvation. Guarded by runtime_flavor() to avoid panicking on current-thread runtimes used by the CLI.
126 lines
3.9 KiB
Rust
126 lines
3.9 KiB
Rust
use std::fs::File;
|
|
use std::io::{self, Write};
|
|
use std::sync::{Arc, Mutex, MutexGuard};
|
|
|
|
use lazy_static::lazy_static;
|
|
use tracing::Subscriber;
|
|
use tracing_subscriber::fmt::MakeWriter;
|
|
use tracing_subscriber::util::SubscriberInitExt;
|
|
|
|
lazy_static! {
|
|
pub static ref LOGGER: StartOSLogger = StartOSLogger::init();
|
|
}
|
|
|
|
#[derive(Clone)]
|
|
pub struct StartOSLogger {
|
|
logfile: LogFile,
|
|
}
|
|
|
|
#[derive(Clone, Default)]
|
|
struct LogFile(Arc<Mutex<Option<File>>>);
|
|
impl<'a> MakeWriter<'a> for LogFile {
|
|
type Writer = Box<dyn Write + 'a>;
|
|
fn make_writer(&'a self) -> Self::Writer {
|
|
let f = self.0.lock().unwrap();
|
|
if f.is_some() {
|
|
struct TeeWriter<'a>(MutexGuard<'a, Option<File>>);
|
|
impl<'a> Write for TeeWriter<'a> {
|
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
// Blocking file+stderr I/O on a tokio worker thread can
|
|
// starve the I/O driver (tokio-rs/tokio#4730).
|
|
// block_in_place tells the runtime to hand off driver
|
|
// duties before we block. Only available on the
|
|
// multi-thread runtime; falls back to a direct write on
|
|
// current-thread runtimes (CLI) or outside a runtime.
|
|
if matches!(
|
|
tokio::runtime::Handle::try_current().map(|h| h.runtime_flavor()),
|
|
Ok(tokio::runtime::RuntimeFlavor::MultiThread),
|
|
) {
|
|
tokio::task::block_in_place(|| self.write_inner(buf))
|
|
} else {
|
|
self.write_inner(buf)
|
|
}
|
|
}
|
|
fn flush(&mut self) -> io::Result<()> {
|
|
if let Some(f) = &mut *self.0 {
|
|
f.flush()?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
impl<'a> TeeWriter<'a> {
|
|
fn write_inner(&mut self, buf: &[u8]) -> io::Result<usize> {
|
|
let n = if let Some(f) = &mut *self.0 {
|
|
f.write(buf)?
|
|
} else {
|
|
buf.len()
|
|
};
|
|
io::stderr().write_all(&buf[..n])?;
|
|
Ok(n)
|
|
}
|
|
}
|
|
Box::new(TeeWriter(f))
|
|
} else {
|
|
drop(f);
|
|
Box::new(io::stderr())
|
|
}
|
|
}
|
|
}
|
|
|
|
impl StartOSLogger {
|
|
pub fn enable(&self) {}
|
|
|
|
pub fn set_logfile(&self, logfile: Option<File>) {
|
|
*self.logfile.0.lock().unwrap() = logfile;
|
|
}
|
|
|
|
fn base_subscriber(logfile: LogFile) -> impl Subscriber {
|
|
use tracing_error::ErrorLayer;
|
|
use tracing_subscriber::prelude::*;
|
|
use tracing_subscriber::{EnvFilter, fmt};
|
|
|
|
let filter_layer = || {
|
|
EnvFilter::builder()
|
|
.with_default_directive(
|
|
format!("{}=info", std::module_path!().split("::").next().unwrap())
|
|
.parse()
|
|
.unwrap(),
|
|
)
|
|
.from_env_lossy()
|
|
};
|
|
|
|
let fmt_layer = fmt::layer()
|
|
.with_writer(logfile)
|
|
.with_line_number(true)
|
|
.with_file(true)
|
|
.with_target(true)
|
|
.with_filter(filter_layer());
|
|
|
|
let sub = tracing_subscriber::registry();
|
|
|
|
#[cfg(feature = "console-subscriber")]
|
|
let sub = sub.with(console_subscriber::spawn());
|
|
|
|
let sub = sub.with(fmt_layer).with(ErrorLayer::default());
|
|
|
|
sub
|
|
}
|
|
fn init() -> Self {
|
|
let logfile = LogFile::default();
|
|
Self::base_subscriber(logfile.clone()).init();
|
|
color_eyre::install().unwrap_or_else(|_| tracing::warn!("tracing too many times"));
|
|
|
|
StartOSLogger { logfile }
|
|
}
|
|
}
|
|
|
|
#[tokio::test]
|
|
pub async fn order_level() {
|
|
assert!(tracing::Level::WARN > tracing::Level::ERROR)
|
|
}
|
|
|
|
#[test]
|
|
pub fn module() {
|
|
println!("{}", module_path!())
|
|
}
|