mirror of
https://github.com/Start9Labs/start-os.git
synced 2026-03-30 12:11:56 +00:00
reduce health check locking
This commit is contained in:
committed by
Aiden McClelland
parent
d4f2def03f
commit
13f08242dd
@@ -395,7 +395,6 @@ pub struct DepInfo {
|
|||||||
pub version: VersionRange,
|
pub version: VersionRange,
|
||||||
pub requirement: DependencyRequirement,
|
pub requirement: DependencyRequirement,
|
||||||
pub description: Option<String>,
|
pub description: Option<String>,
|
||||||
pub critical: bool,
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
#[model]
|
#[model]
|
||||||
pub config: Option<DependencyConfig>,
|
pub config: Option<DependencyConfig>,
|
||||||
@@ -792,37 +791,10 @@ pub fn break_transitive<'a, Db: DbHandle>(
|
|||||||
error: error.clone(),
|
error: error.clone(),
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
if status.main.running() {
|
status.save(&mut tx).await?;
|
||||||
let transitive_error = if model
|
|
||||||
.clone()
|
|
||||||
.manifest()
|
|
||||||
.dependencies()
|
|
||||||
.idx_model(dependency)
|
|
||||||
.get(&mut tx, true)
|
|
||||||
.await?
|
|
||||||
.into_owned()
|
|
||||||
.ok_or_else(|| {
|
|
||||||
Error::new(
|
|
||||||
eyre!("{} not in listed dependencies", dependency),
|
|
||||||
crate::ErrorKind::Database,
|
|
||||||
)
|
|
||||||
})?
|
|
||||||
.critical
|
|
||||||
{
|
|
||||||
status.main.stop();
|
|
||||||
DependencyError::NotRunning
|
|
||||||
} else {
|
|
||||||
DependencyError::Transitive
|
|
||||||
};
|
|
||||||
status.save(&mut tx).await?;
|
|
||||||
|
|
||||||
tx.save().await?;
|
tx.save().await?;
|
||||||
break_all_dependents_transitive(db, id, transitive_error, breakages).await?;
|
break_all_dependents_transitive(db, id, DependencyError::Transitive, breakages).await?;
|
||||||
} else {
|
|
||||||
status.save(&mut tx).await?;
|
|
||||||
|
|
||||||
tx.save().await?;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
status.save(&mut tx).await?;
|
status.save(&mut tx).await?;
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::sync::atomic::AtomicBool;
|
use std::sync::atomic::{AtomicBool, Ordering};
|
||||||
|
|
||||||
use patch_db::DbHandle;
|
use patch_db::DbHandle;
|
||||||
use tracing::instrument;
|
use tracing::instrument;
|
||||||
@@ -20,51 +20,83 @@ pub async fn check<Db: DbHandle>(
|
|||||||
) -> Result<(), Error> {
|
) -> Result<(), Error> {
|
||||||
let mut tx = db.begin().await?;
|
let mut tx = db.begin().await?;
|
||||||
|
|
||||||
|
let mut checkpoint = tx.begin().await?;
|
||||||
|
|
||||||
let installed_model = crate::db::DatabaseModel::new()
|
let installed_model = crate::db::DatabaseModel::new()
|
||||||
.package_data()
|
.package_data()
|
||||||
.idx_model(id)
|
.idx_model(id)
|
||||||
.expect(&mut tx)
|
.expect(&mut checkpoint)
|
||||||
.await?
|
.await?
|
||||||
.installed()
|
.installed()
|
||||||
.expect(&mut tx)
|
.expect(&mut checkpoint)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
let mut checkpoint = tx.begin().await?;
|
|
||||||
|
|
||||||
let manifest = installed_model
|
let manifest = installed_model
|
||||||
.clone()
|
.clone()
|
||||||
.manifest()
|
.manifest()
|
||||||
.get(&mut checkpoint, true)
|
.get(&mut checkpoint, true)
|
||||||
.await?;
|
.await?
|
||||||
|
.into_owned();
|
||||||
|
|
||||||
let mut status = installed_model
|
let started = installed_model
|
||||||
.clone()
|
.clone()
|
||||||
.status()
|
.status()
|
||||||
.get_mut(&mut checkpoint)
|
.main()
|
||||||
.await?;
|
.started()
|
||||||
|
.get(&mut checkpoint, true)
|
||||||
status
|
.await?
|
||||||
.main
|
.into_owned();
|
||||||
.check(&ctx, &mut checkpoint, &*manifest, should_commit)
|
|
||||||
.await?;
|
|
||||||
|
|
||||||
let failed = match &status.main {
|
|
||||||
MainStatus::Running { health, .. } => health.clone(),
|
|
||||||
MainStatus::BackingUp { health, .. } => health.clone(),
|
|
||||||
_ => BTreeMap::new(),
|
|
||||||
};
|
|
||||||
|
|
||||||
status.save(&mut checkpoint).await?;
|
|
||||||
|
|
||||||
checkpoint.save().await?;
|
checkpoint.save().await?;
|
||||||
|
|
||||||
|
let health_results = if let Some(started) = started {
|
||||||
|
manifest
|
||||||
|
.health_checks
|
||||||
|
.check_all(ctx, started, id, &manifest.version, &manifest.volumes)
|
||||||
|
.await?
|
||||||
|
} else {
|
||||||
|
return Ok(());
|
||||||
|
};
|
||||||
|
|
||||||
|
if !should_commit.load(Ordering::SeqCst) {
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut checkpoint = tx.begin().await?;
|
||||||
|
|
||||||
|
let mut status = crate::db::DatabaseModel::new()
|
||||||
|
.package_data()
|
||||||
|
.idx_model(id)
|
||||||
|
.expect(&mut checkpoint)
|
||||||
|
.await?
|
||||||
|
.installed()
|
||||||
|
.expect(&mut checkpoint)
|
||||||
|
.await?
|
||||||
|
.status()
|
||||||
|
.main()
|
||||||
|
.get_mut(&mut checkpoint)
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
match &mut *status {
|
||||||
|
MainStatus::Running { health, .. } => {
|
||||||
|
*health = health_results.clone();
|
||||||
|
}
|
||||||
|
_ => (),
|
||||||
|
}
|
||||||
|
|
||||||
|
status.save(&mut checkpoint).await?;
|
||||||
|
|
||||||
let current_dependents = installed_model
|
let current_dependents = installed_model
|
||||||
.current_dependents()
|
.current_dependents()
|
||||||
.get(&mut tx, true)
|
.get(&mut checkpoint, true)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
|
checkpoint.save().await?;
|
||||||
|
|
||||||
for (dependent, info) in &*current_dependents {
|
for (dependent, info) in &*current_dependents {
|
||||||
let failures: BTreeMap<HealthCheckId, HealthCheckResult> = failed
|
let failures: BTreeMap<HealthCheckId, HealthCheckResult> = health_results
|
||||||
.iter()
|
.iter()
|
||||||
|
.filter(|(_, hc_res)| !matches!(hc_res, HealthCheckResult::Success { .. }))
|
||||||
.filter(|(hc_id, _)| info.health_checks.contains(hc_id))
|
.filter(|(hc_id, _)| info.health_checks.contains(hc_id))
|
||||||
.map(|(k, v)| (k.clone(), v.clone()))
|
.map(|(k, v)| (k.clone(), v.clone()))
|
||||||
.collect();
|
.collect();
|
||||||
|
|||||||
@@ -68,26 +68,12 @@ impl HealthChecks {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
|
|
||||||
#[serde(rename_all = "kebab-case")]
|
|
||||||
pub enum HealthCheckSeverity {
|
|
||||||
Warning,
|
|
||||||
Critical,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Default for HealthCheckSeverity {
|
|
||||||
fn default() -> Self {
|
|
||||||
HealthCheckSeverity::Warning
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||||
pub struct HealthCheck {
|
pub struct HealthCheck {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub description: String,
|
pub description: String,
|
||||||
#[serde(flatten)]
|
#[serde(flatten)]
|
||||||
implementation: ActionImplementation,
|
implementation: ActionImplementation,
|
||||||
#[serde(default)]
|
|
||||||
pub severity: HealthCheckSeverity,
|
|
||||||
pub timeout: Option<Duration>,
|
pub timeout: Option<Duration>,
|
||||||
}
|
}
|
||||||
impl HealthCheck {
|
impl HealthCheck {
|
||||||
|
|||||||
@@ -1,24 +1,19 @@
|
|||||||
use std::collections::BTreeMap;
|
use std::collections::BTreeMap;
|
||||||
use std::sync::atomic::{AtomicBool, Ordering};
|
|
||||||
|
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use patch_db::{DbHandle, HasModel};
|
use patch_db::{HasModel, Model};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use tracing::instrument;
|
|
||||||
|
|
||||||
use self::health_check::{HealthCheckId, HealthCheckSeverity};
|
use self::health_check::HealthCheckId;
|
||||||
use crate::context::RpcContext;
|
|
||||||
use crate::dependencies::DependencyErrors;
|
use crate::dependencies::DependencyErrors;
|
||||||
use crate::notifications::NotificationLevel;
|
|
||||||
use crate::s9pk::manifest::Manifest;
|
|
||||||
use crate::status::health_check::HealthCheckResult;
|
use crate::status::health_check::HealthCheckResult;
|
||||||
use crate::Error;
|
|
||||||
|
|
||||||
pub mod health_check;
|
pub mod health_check;
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize, HasModel)]
|
#[derive(Clone, Debug, Deserialize, Serialize, HasModel)]
|
||||||
#[serde(rename_all = "kebab-case")]
|
#[serde(rename_all = "kebab-case")]
|
||||||
pub struct Status {
|
pub struct Status {
|
||||||
pub configured: bool,
|
pub configured: bool,
|
||||||
|
#[model]
|
||||||
pub main: MainStatus,
|
pub main: MainStatus,
|
||||||
#[model]
|
#[model]
|
||||||
pub dependency_errors: DependencyErrors,
|
pub dependency_errors: DependencyErrors,
|
||||||
@@ -41,66 +36,6 @@ pub enum MainStatus {
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
impl MainStatus {
|
impl MainStatus {
|
||||||
#[instrument(skip(ctx, db, manifest))]
|
|
||||||
pub async fn check<Db: DbHandle>(
|
|
||||||
&mut self,
|
|
||||||
ctx: &RpcContext,
|
|
||||||
db: &mut Db,
|
|
||||||
manifest: &Manifest,
|
|
||||||
should_commit: &AtomicBool,
|
|
||||||
) -> Result<(), Error> {
|
|
||||||
match self {
|
|
||||||
MainStatus::Running { started, health } => {
|
|
||||||
let health_result = manifest
|
|
||||||
.health_checks
|
|
||||||
.check_all(
|
|
||||||
ctx,
|
|
||||||
*started,
|
|
||||||
&manifest.id,
|
|
||||||
&manifest.version,
|
|
||||||
&manifest.volumes,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
if !should_commit.load(Ordering::SeqCst) {
|
|
||||||
return Ok(());
|
|
||||||
} else {
|
|
||||||
// only commit health check results if we are supposed to
|
|
||||||
*health = health_result;
|
|
||||||
}
|
|
||||||
let mut should_stop = false;
|
|
||||||
for (check, res) in health {
|
|
||||||
match &res {
|
|
||||||
health_check::HealthCheckResult::Failure { error }
|
|
||||||
if manifest
|
|
||||||
.health_checks
|
|
||||||
.0
|
|
||||||
.get(check)
|
|
||||||
.map(|hc| hc.severity == HealthCheckSeverity::Critical)
|
|
||||||
.unwrap_or_default() =>
|
|
||||||
{
|
|
||||||
ctx.notification_manager.notify(
|
|
||||||
db,
|
|
||||||
Some(manifest.id.clone()),
|
|
||||||
NotificationLevel::Error,
|
|
||||||
String::from("Critical Health Check Failed"),
|
|
||||||
format!("{} was shut down because a health check required for its operation failed\n{}", manifest.title, error),
|
|
||||||
(),
|
|
||||||
None,
|
|
||||||
)
|
|
||||||
.await?;
|
|
||||||
should_stop = true;
|
|
||||||
}
|
|
||||||
_ => (),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if should_stop {
|
|
||||||
*self = MainStatus::Stopping;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => (),
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
pub fn running(&self) -> bool {
|
pub fn running(&self) -> bool {
|
||||||
match self {
|
match self {
|
||||||
MainStatus::Starting
|
MainStatus::Starting
|
||||||
@@ -125,3 +60,8 @@ impl MainStatus {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
impl MainStatusModel {
|
||||||
|
pub fn started(self) -> Model<Option<DateTime<Utc>>> {
|
||||||
|
self.0.child("started")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user