From 2aa910a3e8ef866422371a8396cfb130a5a86256 Mon Sep 17 00:00:00 2001 From: Aiden McClelland Date: Mon, 23 Mar 2026 01:14:49 -0600 Subject: [PATCH] fix: replace stdio chown with prctl(PR_SET_DUMPABLE) and pipe-wrap After setuid, the kernel clears the dumpable flag, making /proc/self/ entries owned by root. This broke open("/dev/stderr") for non-root users inside subcontainers. The previous fix (chowning /proc/self/fd/*) was dangerous because it chowned whatever file the FD pointed to (could be the journal socket). The proper fix is prctl(PR_SET_DUMPABLE, 1) after setuid, which restores /proc/self/ ownership to the current uid. Additionally, adds a `pipe-wrap` subcommand that wraps a child process with piped stdout/stderr, relaying to the original FDs. This ensures all descendants inherit pipes (which support re-opening via /proc/self/fd/N) even when the outermost FDs are journal sockets. container-runtime.service now uses this wrapper. With pipe-wrap guaranteeing pipe-based FDs, the exec and launch non-TTY paths no longer need their own pipe+relay threads, eliminating the bug where exec would hang when a child daemonized (e.g. pg_ctl start). --- container-runtime/container-runtime.service | 2 +- core/src/service/effects/mod.rs | 4 + core/src/service/effects/subcontainer/sync.rs | 107 +++++++++++++++--- 3 files changed, 98 insertions(+), 15 deletions(-) diff --git a/container-runtime/container-runtime.service b/container-runtime/container-runtime.service index f04150969..e94753327 100644 --- a/container-runtime/container-runtime.service +++ b/container-runtime/container-runtime.service @@ -5,7 +5,7 @@ OnFailure=container-runtime-failure.service [Service] Type=simple Environment=RUST_LOG=startos=debug -ExecStart=/usr/bin/node --experimental-detect-module --trace-warnings /usr/lib/startos/init/index.js +ExecStart=/usr/bin/start-container pipe-wrap /usr/bin/node --experimental-detect-module --trace-warnings /usr/lib/startos/init/index.js Restart=no [Install] diff --git a/core/src/service/effects/mod.rs b/core/src/service/effects/mod.rs index 73d06467f..867865046 100644 --- a/core/src/service/effects/mod.rs +++ b/core/src/service/effects/mod.rs @@ -125,6 +125,10 @@ pub fn handler() -> ParentHandler { .with_call_remote::(), ), ) + .subcommand( + "pipe-wrap", + from_fn_blocking(subcontainer::pipe_wrap).no_display(), + ) // net .subcommand("bind", from_fn_async(net::bind::bind).no_cli()) .subcommand( diff --git a/core/src/service/effects/subcontainer/sync.rs b/core/src/service/effects/subcontainer/sync.rs index e1205a1a1..e6004e71c 100644 --- a/core/src/service/effects/subcontainer/sync.rs +++ b/core/src/service/effects/subcontainer/sync.rs @@ -487,28 +487,14 @@ pub fn launch( } cmd.arg(&chroot); cmd.args(&command); - cmd.stdin(Stdio::piped()); - cmd.stdout(Stdio::piped()); - cmd.stderr(Stdio::piped()); let mut child = cmd .spawn() .map_err(color_eyre::eyre::Report::msg) .with_ctx(|_| (ErrorKind::Filesystem, "spawning child process"))?; send_pid.send(child.id() as i32).unwrap_or_default(); - stdin_send - .send(Box::new(child.stdin.take().unwrap())) - .unwrap_or_default(); - stdout_send - .send(Box::new(child.stdout.take().unwrap())) - .unwrap_or_default(); - stderr_send - .send(Box::new(child.stderr.take().unwrap())) - .unwrap_or_default(); let exit = child .wait() .with_ctx(|_| (ErrorKind::Filesystem, "waiting on child process"))?; - stdout_thread.join().unwrap(); - stderr_thread.map(|t| t.join().unwrap()); if let Some(code) = exit.code() { nix::mount::umount(&chroot.join("proc")) .with_ctx(|_| (ErrorKind::Filesystem, "umount procfs"))?; @@ -767,3 +753,96 @@ pub fn exec( pub fn exec_command(_: ContainerCliContext, params: ExecParams) -> Result<(), Error> { params.exec() } + +/// Wrap a child process so that its stdout/stderr are always pipes, even when +/// the wrapper's own FDs are sockets (e.g. systemd journal). This lets +/// descendants `open("/dev/stderr")` via `/proc/self/fd/2` without ENXIO. +pub fn pipe_wrap( + _: ContainerCliContext, + PipeWrapParams { command }: PipeWrapParams, +) -> Result<(), Error> { + use std::os::fd::AsRawFd; + + let Some(([program], args)) = command.split_at_checked(1) else { + return Err(Error::new( + eyre!("pipe-wrap: command cannot be empty"), + ErrorKind::InvalidRequest, + )); + }; + + let mut cmd = StdCommand::new(program); + cmd.args(args); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + + let mut child = cmd + .spawn() + .with_ctx(|_| (ErrorKind::Filesystem, "pipe-wrap: spawning child process"))?; + + let child_stdout = child.stdout.take().unwrap(); + let child_stderr = child.stderr.take().unwrap(); + + let orig_stdout_fd = std::io::stdout().as_raw_fd(); + let orig_stderr_fd = std::io::stderr().as_raw_fd(); + + // Relay child stdout → original stdout (which may be a socket) + std::thread::spawn(move || { + let mut reader = child_stdout; + let mut buf = [0u8; 8192]; + loop { + match Read::read(&mut reader, &mut buf) { + Ok(0) | Err(_) => break, + Ok(n) => { + let _ = nix::unistd::write( + unsafe { std::os::fd::BorrowedFd::borrow_raw(orig_stdout_fd) }, + &buf[..n], + ); + } + } + } + }); + + // Relay child stderr → original stderr + std::thread::spawn(move || { + let mut reader = child_stderr; + let mut buf = [0u8; 8192]; + loop { + match Read::read(&mut reader, &mut buf) { + Ok(0) | Err(_) => break, + Ok(n) => { + let _ = nix::unistd::write( + unsafe { std::os::fd::BorrowedFd::borrow_raw(orig_stderr_fd) }, + &buf[..n], + ); + } + } + } + }); + + // Forward signals to the child + let child_pid = child.id() as i32; + let mut sig = signal_hook::iterator::Signals::new(FWD_SIGNALS)?; + std::thread::spawn(move || { + for sig in sig.forever() { + match nix::sys::signal::kill( + Pid::from_raw(child_pid), + Some(nix::sys::signal::Signal::try_from(sig).unwrap()), + ) { + Err(Errno::ESRCH) => break, + _ => {} + } + } + }); + + let status = child + .wait() + .with_ctx(|_| (ErrorKind::Filesystem, "pipe-wrap: waiting on child"))?; + std::process::exit(status.code().unwrap_or(1)) +} + +#[derive(Debug, Clone, Serialize, Deserialize, Parser)] +#[group(skip)] +pub struct PipeWrapParams { + #[arg(trailing_var_arg = true, help = "help.arg.command-to-execute")] + command: Vec, +}