mirror of
https://github.com/Start9Labs/start-os.git
synced 2026-03-26 02:11:53 +00:00
handle nvidia-container differently
This commit is contained in:
@@ -15,7 +15,6 @@ use josekit::jwk::Jwk;
|
||||
use reqwest::{Client, Proxy};
|
||||
use rpc_toolkit::yajrc::RpcError;
|
||||
use rpc_toolkit::{CallRemote, Context, Empty};
|
||||
use tokio::process::Command;
|
||||
use tokio::sync::{RwLock, broadcast, oneshot, watch};
|
||||
use tokio::time::Instant;
|
||||
use tracing::instrument;
|
||||
@@ -27,11 +26,6 @@ use crate::context::config::ServerConfig;
|
||||
use crate::db::model::Database;
|
||||
use crate::db::model::package::TaskSeverity;
|
||||
use crate::disk::OsPartitionInfo;
|
||||
use crate::disk::mount::filesystem::bind::Bind;
|
||||
use crate::disk::mount::filesystem::block_dev::BlockDev;
|
||||
use crate::disk::mount::filesystem::loop_dev::LoopDev;
|
||||
use crate::disk::mount::filesystem::{FileSystem, ReadOnly};
|
||||
use crate::disk::mount::guard::MountGuard;
|
||||
use crate::init::{InitResult, check_time_is_synchronized};
|
||||
use crate::install::PKG_ARCHIVE_DIR;
|
||||
use crate::lxc::LxcManager;
|
||||
@@ -47,14 +41,12 @@ use crate::rpc_continuations::{Guid, OpenAuthedContinuations, RpcContinuations};
|
||||
use crate::service::ServiceMap;
|
||||
use crate::service::action::update_tasks;
|
||||
use crate::service::effects::callbacks::ServiceCallbacks;
|
||||
use crate::service::effects::subcontainer::NVIDIA_OVERLAY_PATH;
|
||||
use crate::shutdown::Shutdown;
|
||||
use crate::util::Invoke;
|
||||
use crate::util::future::NonDetachingJoinHandle;
|
||||
use crate::util::io::{TmpDir, delete_file};
|
||||
use crate::util::io::delete_file;
|
||||
use crate::util::lshw::LshwDevice;
|
||||
use crate::util::sync::{SyncMutex, SyncRwLock, Watch};
|
||||
use crate::{ActionId, DATA_DIR, PLATFORM, PackageId};
|
||||
use crate::{ActionId, DATA_DIR, PackageId};
|
||||
|
||||
pub struct RpcContextSeed {
|
||||
is_closed: AtomicBool,
|
||||
@@ -175,86 +167,6 @@ impl RpcContext {
|
||||
init_net_ctrl.complete();
|
||||
tracing::info!("Initialized Net Controller");
|
||||
|
||||
if PLATFORM.ends_with("-nonfree") {
|
||||
if let Err(e) = Command::new("nvidia-smi")
|
||||
.invoke(ErrorKind::ParseSysInfo)
|
||||
.await
|
||||
{
|
||||
tracing::warn!("nvidia-modprobe: {e}");
|
||||
tracing::info!("The above warning can be ignored if no NVIDIA card is present");
|
||||
} else {
|
||||
async {
|
||||
let version: InternedString = String::from_utf8(
|
||||
Command::new("modinfo")
|
||||
.arg("-F")
|
||||
.arg("version")
|
||||
.arg("nvidia")
|
||||
.invoke(ErrorKind::ParseSysInfo)
|
||||
.await?,
|
||||
)?
|
||||
.trim()
|
||||
.into();
|
||||
let sqfs = Path::new("/media/startos/data/package-data/nvidia")
|
||||
.join(&*version)
|
||||
.join("container-overlay.squashfs");
|
||||
if tokio::fs::metadata(&sqfs).await.is_err() {
|
||||
let tmp = TmpDir::new().await?;
|
||||
let procfs = MountGuard::mount(
|
||||
&Bind::new("/proc"),
|
||||
Path::new(&*tmp).join("proc"),
|
||||
ReadOnly,
|
||||
)
|
||||
.await?;
|
||||
Command::new("nvidia-container-cli")
|
||||
.arg("configure")
|
||||
.arg("--no-devbind")
|
||||
.arg("--no-cgroups")
|
||||
.arg("--utility")
|
||||
.arg("--compute")
|
||||
.arg("--graphics")
|
||||
.arg("--video")
|
||||
.arg(&*tmp)
|
||||
.invoke(ErrorKind::Unknown)
|
||||
.await?;
|
||||
procfs.unmount(true).await?;
|
||||
Command::new("ln")
|
||||
.arg("-rsf")
|
||||
.arg(
|
||||
tmp.join("usr/lib64/libnvidia-ml.so")
|
||||
.with_added_extension(&*version),
|
||||
)
|
||||
.arg(tmp.join("usr/lib64/libnvidia-ml.so.1"))
|
||||
.invoke(ErrorKind::Filesystem)
|
||||
.await?;
|
||||
Command::new("chown")
|
||||
.arg("-R")
|
||||
.arg("100000:100000")
|
||||
.arg(&*tmp)
|
||||
.invoke(ErrorKind::Filesystem)
|
||||
.await?;
|
||||
if let Some(p) = sqfs.parent() {
|
||||
tokio::fs::create_dir_all(p)
|
||||
.await
|
||||
.with_ctx(|_| (ErrorKind::Filesystem, format!("mkdir -p {p:?}")))?;
|
||||
}
|
||||
Command::new("mksquashfs")
|
||||
.arg(&*tmp)
|
||||
.arg(&sqfs)
|
||||
.invoke(ErrorKind::Filesystem)
|
||||
.await?;
|
||||
tmp.unmount_and_delete().await?;
|
||||
}
|
||||
BlockDev::new(&sqfs)
|
||||
.mount(NVIDIA_OVERLAY_PATH, ReadOnly)
|
||||
.await?;
|
||||
|
||||
Ok::<_, Error>(())
|
||||
}
|
||||
.await
|
||||
.log_err();
|
||||
}
|
||||
}
|
||||
|
||||
let services = ServiceMap::default();
|
||||
let metrics_cache = Watch::<Option<crate::system::Metrics>>::new(None);
|
||||
let socks_proxy_url = format!("socks5h://{socks_proxy}");
|
||||
|
||||
@@ -147,6 +147,35 @@ impl<G: GenericMountGuard> OverlayGuard<G> {
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
/// Remounts the overlay at a new location. The old mountpoint is unmounted first.
|
||||
pub async fn remount(&mut self, new_mountpoint: impl AsRef<Path>) -> Result<(), Error> {
|
||||
let lower = self.lower.as_ref().ok_or_else(|| {
|
||||
Error::new(
|
||||
eyre!("OverlayGuard has no lower layer"),
|
||||
crate::ErrorKind::Incoherent,
|
||||
)
|
||||
})?;
|
||||
let upper = self.upper.as_ref().ok_or_else(|| {
|
||||
Error::new(
|
||||
eyre!("OverlayGuard has no upper layer"),
|
||||
crate::ErrorKind::Incoherent,
|
||||
)
|
||||
})?;
|
||||
// Unmount from current location
|
||||
self.inner_guard.take().unmount(true).await?;
|
||||
// Remount at new location
|
||||
self.inner_guard = MountGuard::mount(
|
||||
&OverlayFs::new(
|
||||
vec![lower.path()],
|
||||
upper.as_ref().join("upper"),
|
||||
upper.as_ref().join("work"),
|
||||
),
|
||||
new_mountpoint,
|
||||
ReadWrite,
|
||||
)
|
||||
.await?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn take(&mut self) -> Self {
|
||||
Self {
|
||||
lower: self.lower.take(),
|
||||
|
||||
@@ -4,15 +4,15 @@ use imbl_value::InternedString;
|
||||
use tokio::process::Command;
|
||||
|
||||
use crate::ImageId;
|
||||
use crate::disk::mount::filesystem::bind::Bind;
|
||||
use crate::disk::mount::filesystem::overlayfs::OverlayGuard;
|
||||
use crate::disk::mount::guard::GenericMountGuard;
|
||||
use crate::disk::mount::filesystem::ReadOnly;
|
||||
use crate::disk::mount::guard::{GenericMountGuard, MountGuard, TMP_MOUNTPOINT};
|
||||
use crate::rpc_continuations::Guid;
|
||||
use crate::service::effects::prelude::*;
|
||||
use crate::service::persistent_container::Subcontainer;
|
||||
use crate::util::Invoke;
|
||||
|
||||
pub const NVIDIA_OVERLAY_PATH: &str = "/var/tmp/startos/nvidia-overlay";
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
mod sync;
|
||||
|
||||
@@ -104,37 +104,93 @@ pub async fn create_subcontainer_fs(
|
||||
)
|
||||
})?
|
||||
.rootfs_dir();
|
||||
let mountpoint = rootfs_dir
|
||||
let final_mountpoint = rootfs_dir
|
||||
.join("media/startos/subcontainers")
|
||||
.join(guid.as_ref());
|
||||
tokio::fs::create_dir_all(&mountpoint).await?;
|
||||
tokio::fs::create_dir_all(&final_mountpoint).await?;
|
||||
let container_mountpoint = Path::new("/").join(
|
||||
mountpoint
|
||||
.strip_prefix(rootfs_dir)
|
||||
final_mountpoint
|
||||
.strip_prefix(&rootfs_dir)
|
||||
.with_kind(ErrorKind::Incoherent)?,
|
||||
);
|
||||
tracing::info!("Mounting overlay {guid} for {image_id}");
|
||||
let subcontainer_wrapper = Subcontainer {
|
||||
overlay: OverlayGuard::mount_layers(
|
||||
&[],
|
||||
image,
|
||||
if context
|
||||
|
||||
let nvidia_container = context
|
||||
.seed
|
||||
.persistent_container
|
||||
.s9pk
|
||||
.as_manifest()
|
||||
.images
|
||||
.get(&image_id)
|
||||
.map_or(false, |i| i.nvidia_container)
|
||||
&& tokio::fs::metadata(NVIDIA_OVERLAY_PATH).await.is_ok()
|
||||
{
|
||||
&[NVIDIA_OVERLAY_PATH]
|
||||
.map_or(false, |i| i.nvidia_container);
|
||||
|
||||
// If nvidia_container is enabled, we need to stage the overlay outside the LXC rootfs
|
||||
// to safely mount /proc for nvidia-container-cli without exposing it to the container
|
||||
let overlay = if nvidia_container {
|
||||
// Create staging directory outside LXC rootfs
|
||||
let staging_dir = Path::new(TMP_MOUNTPOINT)
|
||||
.join("nvidia-staging")
|
||||
.join(guid.as_ref());
|
||||
tokio::fs::create_dir_all(&staging_dir).await?;
|
||||
|
||||
tracing::info!("Mounting overlay {guid} for {image_id} at staging location");
|
||||
let mut overlay = OverlayGuard::mount(image, &staging_dir).await?;
|
||||
|
||||
// Mount /proc temporarily for nvidia-container-cli (outside LXC rootfs)
|
||||
let staging_proc = staging_dir.join("proc");
|
||||
tokio::fs::create_dir_all(&staging_proc).await?;
|
||||
let proc_mount = MountGuard::mount(&Bind::new("/proc"), &staging_proc, ReadOnly).await?;
|
||||
|
||||
// Read environment variables from the image's env file
|
||||
let env_file = rootfs_dir
|
||||
.join("media/startos/images")
|
||||
.join(image_id.as_ref())
|
||||
.with_extension("env");
|
||||
let env_content = tokio::fs::read_to_string(&env_file)
|
||||
.await
|
||||
.unwrap_or_default();
|
||||
|
||||
// Build nvidia-container-cli command with environment variables
|
||||
let mut cmd = Command::new("nvidia-container-cli");
|
||||
cmd.arg("configure")
|
||||
.arg("--no-cgroups")
|
||||
.arg("--utility")
|
||||
.arg("--compute")
|
||||
.arg("--graphics")
|
||||
.arg("--video");
|
||||
|
||||
// Pass NVIDIA_* environment variables to nvidia-container-cli
|
||||
for line in env_content.lines() {
|
||||
if let Some((key, value)) = line.split_once('=') {
|
||||
if key.starts_with("NVIDIA_") {
|
||||
cmd.env(key, value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cmd.arg(&staging_dir);
|
||||
|
||||
tracing::info!("Running nvidia-container-cli for {image_id}");
|
||||
cmd.invoke(ErrorKind::Unknown).await?;
|
||||
|
||||
// Unmount /proc
|
||||
proc_mount.unmount(false).await?;
|
||||
tracing::info!("nvidia-container-cli completed for {image_id}");
|
||||
|
||||
// Remount overlay at final location inside LXC rootfs
|
||||
tracing::info!("Remounting overlay {guid} to final location");
|
||||
overlay.remount(&final_mountpoint).await?;
|
||||
|
||||
// Clean up staging directory
|
||||
tokio::fs::remove_dir_all(&staging_dir).await.ok();
|
||||
|
||||
overlay
|
||||
} else {
|
||||
&[]
|
||||
},
|
||||
&mountpoint,
|
||||
)
|
||||
.await?,
|
||||
tracing::info!("Mounting overlay {guid} for {image_id}");
|
||||
OverlayGuard::mount(image, &final_mountpoint).await?
|
||||
};
|
||||
|
||||
let subcontainer_wrapper = Subcontainer {
|
||||
overlay,
|
||||
name: name
|
||||
.unwrap_or_else(|| InternedString::intern(format!("subcontainer-{}", image_id))),
|
||||
image_id: image_id.clone(),
|
||||
@@ -142,7 +198,7 @@ pub async fn create_subcontainer_fs(
|
||||
|
||||
Command::new("chown")
|
||||
.arg("100000:100000")
|
||||
.arg(&mountpoint)
|
||||
.arg(&final_mountpoint)
|
||||
.invoke(ErrorKind::Filesystem)
|
||||
.await?;
|
||||
tracing::info!("Mounted overlay {guid} for {image_id}");
|
||||
|
||||
Reference in New Issue
Block a user