openzeppelin_relayer/metrics/
mod.rs

1//! Metrics module for the application.
2//!
3//! - This module contains the global Prometheus registry.
4//! - Defines specific metrics for the application.
5
6pub mod middleware;
7use lazy_static::lazy_static;
8use prometheus::{
9    CounterVec, Encoder, Gauge, GaugeVec, HistogramOpts, HistogramVec, Opts, Registry, TextEncoder,
10};
11
// Stage labels for TRANSACTION_PROCESSING_TIME histogram.
// Each constant below is a value for that histogram's `stage` label and is intended to be
// passed as the `stage` argument of `observe_processing_time`.

/// `stage` label value `"request_queue_dwell"`.
/// NOTE(review): presumably the time a request waits in the request queue; confirm at call sites.
pub const STAGE_REQUEST_QUEUE_DWELL: &str = "request_queue_dwell";
/// `stage` label value `"prepare_duration"`.
/// NOTE(review): presumably the time spent preparing a transaction; confirm at call sites.
pub const STAGE_PREPARE_DURATION: &str = "prepare_duration";
/// `stage` label value `"submission_queue_dwell"`.
/// NOTE(review): presumably the time a transaction waits in the submission queue; confirm at call sites.
pub const STAGE_SUBMISSION_QUEUE_DWELL: &str = "submission_queue_dwell";
/// `stage` label value `"submit_duration"`.
/// NOTE(review): presumably the time spent submitting a transaction; confirm at call sites.
pub const STAGE_SUBMIT_DURATION: &str = "submit_duration";
17
18/// Observe a duration on the `TRANSACTION_PROCESSING_TIME` histogram.
19pub fn observe_processing_time(relayer_id: &str, network_type: &str, stage: &str, secs: f64) {
20    TRANSACTION_PROCESSING_TIME
21        .with_label_values(&[relayer_id, network_type, stage])
22        .observe(secs);
23}
24
25/// Observe queue pickup latency (time from send to consumer pickup).
26pub fn observe_queue_pickup_latency(queue_type: &str, backend: &str, secs: f64) {
27    QUEUE_PICKUP_LATENCY
28        .with_label_values(&[queue_type, backend])
29        .observe(secs);
30}
31use sysinfo::{Disks, System};
32
33lazy_static! {
34    // Global Prometheus registry.
35    pub static ref REGISTRY: Registry = Registry::new();
36
37    // Counter: Total HTTP requests.
38    pub static ref REQUEST_COUNTER: CounterVec = {
39        let opts = Opts::new("requests_total", "Total number of HTTP requests");
40        let counter_vec = CounterVec::new(opts, &["endpoint", "method", "status"]).unwrap();
41        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
42        counter_vec
43    };
44
45    // Counter: Total HTTP requests by raw URI.
46    pub static ref RAW_REQUEST_COUNTER: CounterVec = {
47      let opts = Opts::new("raw_requests_total", "Total number of HTTP requests by raw URI");
48      let counter_vec = CounterVec::new(opts, &["raw_uri", "method", "status"]).unwrap();
49      REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
50      counter_vec
51    };
52
53    // Histogram for request latency in seconds.
54    pub static ref REQUEST_LATENCY: HistogramVec = {
55      let histogram_opts = HistogramOpts::new("request_latency_seconds", "Request latency in seconds")
56          .buckets(vec![0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 25.0, 50.0, 100.0]);
57      let histogram_vec = HistogramVec::new(histogram_opts, &["endpoint", "method", "status"]).unwrap();
58      REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
59      histogram_vec
60    };
61
62    // Counter for error responses.
63    pub static ref ERROR_COUNTER: CounterVec = {
64        let opts = Opts::new("error_requests_total", "Total number of error responses");
65        // Using "status" to record the HTTP status code (or a special label like "service_error")
66        let counter_vec = CounterVec::new(opts, &["endpoint", "method", "status"]).unwrap();
67        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
68        counter_vec
69    };
70
71    // Gauge for CPU usage percentage.
72    pub static ref CPU_USAGE: Gauge = {
73      let gauge = Gauge::new("cpu_usage_percentage", "Current CPU usage percentage").unwrap();
74      REGISTRY.register(Box::new(gauge.clone())).unwrap();
75      gauge
76    };
77
78    // Gauge for memory usage percentage.
79    pub static ref MEMORY_USAGE_PERCENT: Gauge = {
80      let gauge = Gauge::new("memory_usage_percentage", "Memory usage percentage").unwrap();
81      REGISTRY.register(Box::new(gauge.clone())).unwrap();
82      gauge
83    };
84
85    // Gauge for memory usage in bytes.
86    pub static ref MEMORY_USAGE: Gauge = {
87        let gauge = Gauge::new("memory_usage_bytes", "Memory usage in bytes").unwrap();
88        REGISTRY.register(Box::new(gauge.clone())).unwrap();
89        gauge
90    };
91
92    // Gauge for total memory in bytes.
93    pub static ref TOTAL_MEMORY: Gauge = {
94      let gauge = Gauge::new("total_memory_bytes", "Total memory in bytes").unwrap();
95      REGISTRY.register(Box::new(gauge.clone())).unwrap();
96      gauge
97    };
98
99    // Gauge for available memory in bytes.
100    pub static ref AVAILABLE_MEMORY: Gauge = {
101        let gauge = Gauge::new("available_memory_bytes", "Available memory in bytes").unwrap();
102        REGISTRY.register(Box::new(gauge.clone())).unwrap();
103        gauge
104    };
105
106    // Gauge for used disk space in bytes.
107    pub static ref DISK_USAGE: Gauge = {
108      let gauge = Gauge::new("disk_usage_bytes", "Used disk space in bytes").unwrap();
109      REGISTRY.register(Box::new(gauge.clone())).unwrap();
110      gauge
111    };
112
113    // Gauge for disk usage percentage.
114    pub static ref DISK_USAGE_PERCENT: Gauge = {
115      let gauge = Gauge::new("disk_usage_percentage", "Disk usage percentage").unwrap();
116      REGISTRY.register(Box::new(gauge.clone())).unwrap();
117      gauge
118    };
119
120    // Gauge for in-flight requests.
121    pub static ref IN_FLIGHT_REQUESTS: GaugeVec = {
122        let gauge_vec = GaugeVec::new(
123            Opts::new("in_flight_requests", "Number of in-flight requests"),
124            &["endpoint"]
125        ).unwrap();
126        REGISTRY.register(Box::new(gauge_vec.clone())).unwrap();
127        gauge_vec
128    };
129
130    // Counter for request timeouts.
131    pub static ref TIMEOUT_COUNTER: CounterVec = {
132        let opts = Opts::new("request_timeouts_total", "Total number of request timeouts");
133        let counter_vec = CounterVec::new(opts, &["endpoint", "method", "timeout_type"]).unwrap();
134        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
135        counter_vec
136    };
137
138    // Gauge for file descriptor count.
139    pub static ref FILE_DESCRIPTORS: Gauge = {
140        let gauge = Gauge::new("file_descriptors_count", "Current file descriptor count").unwrap();
141        REGISTRY.register(Box::new(gauge.clone())).unwrap();
142        gauge
143    };
144
145    // Gauge for CLOSE_WAIT socket count.
146    pub static ref CLOSE_WAIT_SOCKETS: Gauge = {
147        let gauge = Gauge::new("close_wait_sockets_count", "Number of CLOSE_WAIT sockets").unwrap();
148        REGISTRY.register(Box::new(gauge.clone())).unwrap();
149        gauge
150    };
151
152    // Counter for successful transactions (Confirmed status).
153    pub static ref TRANSACTIONS_SUCCESS: CounterVec = {
154        let opts = Opts::new("transactions_success_total", "Total number of successful transactions");
155        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
156        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
157        counter_vec
158    };
159
160    // Counter for failed transactions (Failed, Expired, Canceled statuses).
161    // Labels: relayer_id, network_type, failure_reason, previous_status.
162    // Note: `previous_status` label added to track the pipeline stage before the failure
163    // (e.g. "pending", "sent", "submitted"), enabling pre- vs post-submission attribution.
164    pub static ref TRANSACTIONS_FAILED: CounterVec = {
165        let opts = Opts::new("transactions_failed_total", "Total number of failed transactions");
166        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type", "failure_reason", "previous_status"]).unwrap();
167        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
168        counter_vec
169    };
170
171    // Counter for RPC failures during API requests (before transaction creation).
172    // This tracks failures that occur during operations like get_status, get_balance, etc.
173    // that happen before a transaction is created.
174    pub static ref API_RPC_FAILURES: CounterVec = {
175        let opts = Opts::new("api_rpc_failures_total", "Total number of RPC failures during API requests (before transaction creation)");
176        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type", "operation_name", "error_type"]).unwrap();
177        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
178        counter_vec
179    };
180
181    // Counter for transaction creation (when a transaction is successfully created in the repository).
182    pub static ref TRANSACTIONS_CREATED: CounterVec = {
183        let opts = Opts::new("transactions_created_total", "Total number of transactions created");
184        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
185        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
186        counter_vec
187    };
188
189    // Counter for transaction submissions (when status changes to Submitted).
190    pub static ref TRANSACTIONS_SUBMITTED: CounterVec = {
191        let opts = Opts::new("transactions_submitted_total", "Total number of transactions submitted to the network");
192        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
193        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
194        counter_vec
195    };
196
197    // Gauge for transaction status distribution (current count of transactions in each status).
198    pub static ref TRANSACTIONS_BY_STATUS: GaugeVec = {
199        let gauge_vec = GaugeVec::new(
200            Opts::new("transactions_by_status", "Current number of transactions by status"),
201            &["relayer_id", "network_type", "status"]
202        ).unwrap();
203        REGISTRY.register(Box::new(gauge_vec.clone())).unwrap();
204        gauge_vec
205    };
206
207    // Histogram for transaction processing times (creation to submission).
208    pub static ref TRANSACTION_PROCESSING_TIME: HistogramVec = {
209        let histogram_opts = HistogramOpts::new("transaction_processing_seconds", "Transaction processing time in seconds")
210            .buckets(vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0]);
211        let histogram_vec = HistogramVec::new(histogram_opts, &["relayer_id", "network_type", "stage"]).unwrap();
212        REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
213        histogram_vec
214    };
215
216    // Histogram for queue pickup latency (time from send to consumer pickup).
217    pub static ref QUEUE_PICKUP_LATENCY: HistogramVec = {
218        let histogram_opts = HistogramOpts::new("queue_pickup_latency_seconds", "Queue pickup latency in seconds (send to consumer pickup)")
219            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0, 600.0]);
220        let histogram_vec = HistogramVec::new(histogram_opts, &["queue_type", "backend"]).unwrap();
221        REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
222        histogram_vec
223    };
224
225    // Histogram for RPC call latency.
226    pub static ref RPC_CALL_LATENCY: HistogramVec = {
227        let histogram_opts = HistogramOpts::new("rpc_call_latency_seconds", "RPC call latency in seconds")
228            .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0]);
229        let histogram_vec = HistogramVec::new(histogram_opts, &["relayer_id", "network_type", "operation_name"]).unwrap();
230        REGISTRY.register(Box::new(histogram_vec.clone())).unwrap();
231        histogram_vec
232    };
233
234    // Counter for Stellar transaction submission failures with decoded result codes.
235    pub static ref STELLAR_SUBMISSION_FAILURES: CounterVec = {
236        let opts = Opts::new("stellar_submission_failures_total",
237            "Stellar transaction submission failures by status and result code");
238        let counter_vec = CounterVec::new(opts, &["submit_status", "result_code"]).unwrap();
239        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
240        counter_vec
241    };
242
243    // Counter for plugin calls (tracks requests to /api/v1/plugins/{plugin_id}/call endpoints).
244    pub static ref PLUGIN_CALLS: CounterVec = {
245        let opts = Opts::new("plugin_calls_total", "Total number of plugin calls");
246        let counter_vec = CounterVec::new(opts, &["plugin_id", "method", "status"]).unwrap();
247        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
248        counter_vec
249    };
250
251    // Counter for Stellar submit responses with TRY_AGAIN_LATER status.
252    pub static ref STELLAR_TRY_AGAIN_LATER: CounterVec = {
253        let opts = Opts::new(
254            "stellar_try_again_later_total",
255            "Total number of Stellar transaction submit responses with TRY_AGAIN_LATER"
256        );
257        let counter_vec = CounterVec::new(opts, &["relayer_id", "tx_status"]).unwrap();
258        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
259        counter_vec
260    };
261
262    // Counter for transactions confirmed after experiencing TRY_AGAIN_LATER.
263    pub static ref TRANSACTIONS_TRY_AGAIN_LATER_SUCCESS: CounterVec = {
264        let opts = Opts::new(
265            "transactions_try_again_later_success_total",
266            "Total number of transactions confirmed after experiencing TRY_AGAIN_LATER"
267        );
268        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
269        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
270        counter_vec
271    };
272
273    // Counter for transactions that failed after experiencing TRY_AGAIN_LATER.
274    pub static ref TRANSACTIONS_TRY_AGAIN_LATER_FAILED: CounterVec = {
275        let opts = Opts::new(
276            "transactions_try_again_later_failed_total",
277            "Total number of transactions that failed after experiencing TRY_AGAIN_LATER"
278        );
279        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
280        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
281        counter_vec
282    };
283
284    // Counter for transactions that encountered an insufficient fee error.
285    pub static ref TRANSACTIONS_INSUFFICIENT_FEE: CounterVec = {
286        let opts = Opts::new(
287            "transactions_insufficient_fee_total",
288            "Total number of transactions that encountered an insufficient fee error"
289        );
290        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
291        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
292        counter_vec
293    };
294
295    // Counter for transactions confirmed after experiencing insufficient fee.
296    pub static ref TRANSACTIONS_INSUFFICIENT_FEE_SUCCESS: CounterVec = {
297        let opts = Opts::new(
298            "transactions_insufficient_fee_success_total",
299            "Total number of transactions confirmed after experiencing insufficient fee"
300        );
301        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
302        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
303        counter_vec
304    };
305
306    // Counter for transactions that failed after experiencing insufficient fee.
307    pub static ref TRANSACTIONS_INSUFFICIENT_FEE_FAILED: CounterVec = {
308        let opts = Opts::new(
309            "transactions_insufficient_fee_failed_total",
310            "Total number of transactions that failed after experiencing insufficient fee"
311        );
312        let counter_vec = CounterVec::new(opts, &["relayer_id", "network_type"]).unwrap();
313        REGISTRY.register(Box::new(counter_vec.clone())).unwrap();
314        counter_vec
315    };
316}
317
318/// Gather all metrics and encode into the provided format.
319pub fn gather_metrics() -> Result<Vec<u8>, Box<dyn std::error::Error>> {
320    let encoder = TextEncoder::new();
321    let metric_families = REGISTRY.gather();
322    let mut buffer = Vec::new();
323    encoder.encode(&metric_families, &mut buffer)?;
324    Ok(buffer)
325}
326
/// Get file descriptor count for current process.
///
/// - Linux: number of entries in `/proc/self/fd`.
/// - macOS: number of lines printed by `lsof -p <pid>`, minus the header line.
/// - Any other platform: always `Ok(0)`.
///
/// # Errors
/// Returns the I/O error from listing `/proc/self/fd` (Linux) or from spawning `lsof` (macOS).
fn get_fd_count() -> Result<usize, std::io::Error> {
    #[cfg(target_os = "linux")]
    {
        // `/proc/self` always resolves to the calling process, so no pid lookup is needed.
        std::fs::read_dir("/proc/self/fd").map(|entries| entries.count())
    }

    #[cfg(target_os = "macos")]
    {
        use std::process::Command;
        // The pid is only needed here; computing it per-branch avoids an unused
        // variable on platforms that do not use it.
        let pid = std::process::id();
        let output = Command::new("lsof")
            .args(["-p", &pid.to_string()])
            .output()?;
        let count = String::from_utf8_lossy(&output.stdout)
            .lines()
            .count()
            .saturating_sub(1); // Subtract header line
        Ok(count)
    }

    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
    {
        Ok(0) // Unsupported platform
    }
}
355
/// Get CLOSE_WAIT socket count.
///
/// - Linux: counts rows in `/proc/net/tcp` and `/proc/net/tcp6` whose state column is
///   `08` (CLOSE_WAIT). This needs no external tools, so it keeps working in minimal
///   images that ship without `netstat`. A table that does not exist contributes zero.
/// - macOS: runs `netstat -an | grep CLOSE_WAIT | wc -l` through `sh`; unparsable output
///   counts as zero.
/// - Any other platform: always `Ok(0)`.
///
/// # Errors
/// Returns the I/O error from reading a socket table (other than "not found") on Linux,
/// or from spawning `sh` on macOS.
fn get_close_wait_count() -> Result<usize, std::io::Error> {
    #[cfg(target_os = "linux")]
    {
        let mut total = 0;
        for table in ["/proc/net/tcp", "/proc/net/tcp6"] {
            match std::fs::read_to_string(table) {
                Ok(contents) => total += count_close_wait_entries(&contents),
                // e.g. IPv6 disabled: no table means no sockets of that family.
                Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
                Err(err) => return Err(err),
            }
        }
        Ok(total)
    }

    #[cfg(target_os = "macos")]
    {
        use std::process::Command;
        let output = Command::new("sh")
            .args(["-c", "netstat -an | grep CLOSE_WAIT | wc -l"])
            .output()?;
        let count = String::from_utf8_lossy(&output.stdout)
            .trim()
            .parse()
            .unwrap_or(0);
        Ok(count)
    }

    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
    {
        Ok(0) // Unsupported platform
    }
}

/// Counts the rows of a `/proc/net/tcp`-formatted table whose fourth
/// whitespace-separated column (the hexadecimal socket state) is `08`, i.e. CLOSE_WAIT.
#[cfg(target_os = "linux")]
fn count_close_wait_entries(table: &str) -> usize {
    table
        .lines()
        .skip(1) // column header row
        .filter(|row| row.split_whitespace().nth(3) == Some("08"))
        .count()
}
376
377/// Updates the system metrics for CPU and memory usage.
378pub fn update_system_metrics() {
379    let mut sys = System::new_all();
380    sys.refresh_all();
381
382    // Overall CPU usage.
383    let cpu_usage = sys.global_cpu_usage();
384    CPU_USAGE.set(cpu_usage as f64);
385
386    // Total memory (in bytes).
387    let total_memory = sys.total_memory();
388    TOTAL_MEMORY.set(total_memory as f64);
389
390    // Available memory (in bytes).
391    let available_memory = sys.available_memory();
392    AVAILABLE_MEMORY.set(available_memory as f64);
393
394    // Used memory (in bytes).
395    let memory_usage = sys.used_memory();
396    MEMORY_USAGE.set(memory_usage as f64);
397
398    // Calculate memory usage percentage
399    let memory_percentage = if total_memory > 0 {
400        (memory_usage as f64 / total_memory as f64) * 100.0
401    } else {
402        0.0
403    };
404    MEMORY_USAGE_PERCENT.set(memory_percentage);
405
406    // Calculate disk usage:
407    // Sum total space and available space across all disks.
408    let disks = Disks::new_with_refreshed_list();
409    let mut total_disk_space: u64 = 0;
410    let mut total_disk_available: u64 = 0;
411    for disk in disks.list() {
412        total_disk_space += disk.total_space();
413        total_disk_available += disk.available_space();
414    }
415    // Used disk space is total minus available ( in bytes).
416    let used_disk_space = total_disk_space.saturating_sub(total_disk_available);
417    DISK_USAGE.set(used_disk_space as f64);
418
419    // Calculate disk usage percentage.
420    let disk_percentage = if total_disk_space > 0 {
421        (used_disk_space as f64 / total_disk_space as f64) * 100.0
422    } else {
423        0.0
424    };
425    DISK_USAGE_PERCENT.set(disk_percentage);
426
427    // Update file descriptor count.
428    if let Ok(fd_count) = get_fd_count() {
429        FILE_DESCRIPTORS.set(fd_count as f64);
430    }
431
432    // Update CLOSE_WAIT socket count.
433    if let Ok(close_wait) = get_close_wait_count() {
434        CLOSE_WAIT_SOCKETS.set(close_wait as f64);
435    }
436}
437
#[cfg(test)]
mod actix_tests {
    use super::*;
    use actix_web::{
        dev::{Service, ServiceRequest, ServiceResponse, Transform},
        http, test, Error, HttpResponse,
    };
    use futures::future::{self};
    use middleware::MetricsMiddleware;
    use prometheus::proto::MetricFamily;
    use std::{
        pin::Pin,
        task::{Context, Poll},
    };

    /// Boxed future type shared by both stub services.
    type BoxedOutcome = Pin<Box<dyn future::Future<Output = Result<ServiceResponse, Error>>>>;

    /// Stub inner service that answers every request with an empty HTTP 200 OK.
    struct DummySuccessService;

    impl Service<ServiceRequest> for DummySuccessService {
        type Response = ServiceResponse;
        type Error = Error;
        type Future = BoxedOutcome;

        fn poll_ready(&self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&self, req: ServiceRequest) -> Self::Future {
            let outcome: Result<Self::Response, Self::Error> =
                Ok(req.into_response(HttpResponse::Ok().finish()));
            Box::pin(future::ready(outcome))
        }
    }

    /// Stub inner service that fails every request with an internal server error.
    struct DummyErrorService;

    impl Service<ServiceRequest> for DummyErrorService {
        type Response = ServiceResponse;
        type Error = Error;
        type Future = BoxedOutcome;

        fn poll_ready(&self, _cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
            Poll::Ready(Ok(()))
        }

        fn call(&self, _req: ServiceRequest) -> Self::Future {
            let outcome: Result<Self::Response, Self::Error> =
                Err(actix_web::error::ErrorInternalServerError("dummy error"));
            Box::pin(future::ready(outcome))
        }
    }

    /// Returns the first metric family in `families` called `name`, if any.
    fn find_metric_family<'a>(
        name: &str,
        families: &'a [MetricFamily],
    ) -> Option<&'a MetricFamily> {
        for family in families {
            if family.name() == name {
                return Some(family);
            }
        }
        None
    }

    /// Counter values of every series in `family` whose `endpoint` label equals `endpoint`.
    fn counter_values_for_endpoint(family: &MetricFamily, endpoint: &str) -> Vec<f64> {
        family
            .get_metric()
            .iter()
            .filter(|series| {
                series
                    .get_label()
                    .iter()
                    .any(|pair| pair.name() == "endpoint" && pair.value() == endpoint)
            })
            .map(|series| series.get_counter().value())
            .collect()
    }

    #[actix_rt::test]
    async fn test_gather_metrics_contains_expected_names() {
        // Populate the system gauges.
        update_system_metrics();

        // Label-based metrics only show up in the output once a series exists.
        let ok_labels = ["/test", "GET", "200"];
        REQUEST_COUNTER.with_label_values(&ok_labels).inc();
        RAW_REQUEST_COUNTER
            .with_label_values(&["/test?param=value", "GET", "200"])
            .inc();
        REQUEST_LATENCY.with_label_values(&ok_labels).observe(0.1);
        ERROR_COUNTER
            .with_label_values(&["/test", "GET", "500"])
            .inc();

        // Insufficient fee metrics first, then the TRY_AGAIN_LATER metrics.
        let relayer_counters: [&CounterVec; 5] = [
            &*TRANSACTIONS_INSUFFICIENT_FEE,
            &*TRANSACTIONS_INSUFFICIENT_FEE_SUCCESS,
            &*TRANSACTIONS_INSUFFICIENT_FEE_FAILED,
            &*TRANSACTIONS_TRY_AGAIN_LATER_SUCCESS,
            &*TRANSACTIONS_TRY_AGAIN_LATER_FAILED,
        ];
        for counter in relayer_counters.iter() {
            counter
                .with_label_values(&["test-relayer", "stellar"])
                .inc();
        }

        // Queue pickup latency
        observe_queue_pickup_latency("transaction-request", "sqs", 0.5);

        let encoded = gather_metrics().expect("failed to gather metrics");
        let output = String::from_utf8(encoded).expect("metrics output is not valid UTF-8");

        let expected_names = [
            // System metrics
            "cpu_usage_percentage",
            "memory_usage_percentage",
            "memory_usage_bytes",
            "total_memory_bytes",
            "available_memory_bytes",
            "disk_usage_bytes",
            "disk_usage_percentage",
            // Request metrics
            "requests_total",
            "raw_requests_total",
            "request_latency_seconds",
            "error_requests_total",
            // Insufficient fee metrics
            "transactions_insufficient_fee_total",
            "transactions_insufficient_fee_success_total",
            "transactions_insufficient_fee_failed_total",
            // TRY_AGAIN_LATER metrics
            "transactions_try_again_later_success_total",
            "transactions_try_again_later_failed_total",
            // Queue pickup latency
            "queue_pickup_latency_seconds",
        ];
        for name in expected_names.iter() {
            assert!(output.contains(*name));
        }
    }

    #[actix_rt::test]
    async fn test_update_system_metrics() {
        // Start from a known state so stale values cannot satisfy the checks.
        for gauge in [
            &*CPU_USAGE,
            &*TOTAL_MEMORY,
            &*AVAILABLE_MEMORY,
            &*MEMORY_USAGE,
            &*MEMORY_USAGE_PERCENT,
            &*DISK_USAGE,
            &*DISK_USAGE_PERCENT,
        ]
        .iter()
        {
            gauge.set(0.0);
        }

        // Function under test.
        update_system_metrics();

        // Snapshot every gauge, then check each one for a plausible value.
        let cpu_usage = CPU_USAGE.get();
        let memory_usage = MEMORY_USAGE.get();
        let memory_percent = MEMORY_USAGE_PERCENT.get();
        let total_memory = TOTAL_MEMORY.get();
        let available_memory = AVAILABLE_MEMORY.get();
        let disk_usage = DISK_USAGE.get();
        let disk_percent = DISK_USAGE_PERCENT.get();
        let percent_range = 0.0..=100.0;

        assert!(
            percent_range.contains(&cpu_usage),
            "CPU usage should be between 0-100%, got {cpu_usage}"
        );
        assert!(
            memory_usage >= 0.0,
            "Memory usage should be >= 0, got {memory_usage}"
        );
        assert!(
            percent_range.contains(&memory_percent),
            "Memory usage percentage should be between 0-100%, got {memory_percent}"
        );
        assert!(
            total_memory > 0.0,
            "Total memory should be > 0, got {total_memory}"
        );
        assert!(
            available_memory >= 0.0,
            "Available memory should be >= 0, got {available_memory}"
        );
        assert!(
            disk_usage >= 0.0,
            "Disk usage should be >= 0, got {disk_usage}"
        );
        assert!(
            percent_range.contains(&disk_percent),
            "Disk usage percentage should be between 0-100%, got {disk_percent}"
        );

        // Used memory must fit inside total memory.
        assert!(
            memory_usage <= total_memory,
            "Memory usage should be <= total memory, got {memory_usage}"
        );

        // Available plus used memory must fit inside total memory as well.
        let accounted_memory = available_memory + memory_usage;
        assert!(
            accounted_memory <= total_memory,
            "Available memory plus used memory should be <= total memory {}, got {}",
            total_memory,
            available_memory + memory_usage
        );
    }

    #[actix_rt::test]
    async fn test_middleware_success() {
        let request = test::TestRequest::with_uri("/test_success").to_srv_request();
        let service = MetricsMiddleware
            .new_transform(DummySuccessService)
            .await
            .unwrap();

        let response = service.call(request).await.unwrap();
        assert_eq!(response.response().status(), http::StatusCode::OK);

        let families = REGISTRY.gather();
        let counter_fam = find_metric_family("requests_total", &families)
            .expect("requests_total metric family not found");

        let values = counter_values_for_endpoint(counter_fam, "/test_success");
        for value in &values {
            assert!(*value >= 1.0);
        }
        assert!(
            !values.is_empty(),
            "Expected metric with endpoint '/test_success' not found"
        );
    }

    #[actix_rt::test]
    async fn test_middleware_error() {
        let request = test::TestRequest::with_uri("/test_error").to_srv_request();
        let service = MetricsMiddleware
            .new_transform(DummyErrorService)
            .await
            .unwrap();

        let outcome = service.call(request).await;
        assert!(outcome.is_err());

        let families = REGISTRY.gather();
        let error_counter_fam = find_metric_family("error_requests_total", &families)
            .expect("error_requests_total metric family not found");

        let values = counter_values_for_endpoint(error_counter_fam, "/test_error");
        for value in &values {
            assert!(*value >= 1.0);
        }
        assert!(
            !values.is_empty(),
            "Expected error metric with endpoint '/test_error' not found"
        );
    }
}
702
#[cfg(test)]
mod property_tests {
    use proptest::{prelude::*, test_runner::Config};

    /// Share of `total` taken up by `used`, as a percentage; 0.0 when `total` is zero.
    fn compute_percentage(used: u64, total: u64) -> f64 {
        if total == 0 {
            return 0.0;
        }
        (used as f64 / total as f64) * 100.0
    }

    proptest! {
        // Run 1000 cases per property.
        #![proptest_config(Config {
            cases: 1000,
            ..Config::default()
        })]

        #[test]
        fn prop_compute_percentage((total, used) in {
            (1u64..1_000_000u64).prop_flat_map(|total| (Just(total), 0u64..=total))
        }) {
            let percentage = compute_percentage(used, total);
            prop_assert!(percentage >= 0.0);
            prop_assert!(percentage <= 100.0);
        }

        #[test]
        fn prop_labels_are_reasonable(
            endpoint in ".*",
            method in prop::sample::select(
                ["GET", "POST", "PUT", "DELETE"]
                    .iter()
                    .map(|verb| verb.to_string())
                    .collect::<Vec<String>>()
            )
        ) {
            // An empty endpoint is normalised to the root path.
            let endpoint_label = match endpoint.as_str() {
                "" => "/".to_string(),
                _ => endpoint.clone(),
            };

            prop_assert!(endpoint_label.chars().count() <= 1024, "Endpoint label too long");
            prop_assert!(method.chars().count() <= 16, "Method label too long");

            let labels = [endpoint_label, method, "200".to_string()];
            for label in labels.iter() {
                prop_assert!(!label.is_empty());
                prop_assert!(label.len() < 1024);
            }
        }
    }
}
759
#[cfg(test)]
mod processing_time_tests {
    use super::*;

    /// Current sample count of one `TRANSACTION_PROCESSING_TIME` series.
    fn processing_samples(relayer_id: &str, network_type: &str, stage: &str) -> u64 {
        TRANSACTION_PROCESSING_TIME
            .with_label_values(&[relayer_id, network_type, stage])
            .get_sample_count()
    }

    /// Current sample sum of one `TRANSACTION_PROCESSING_TIME` series.
    fn processing_sum(relayer_id: &str, network_type: &str, stage: &str) -> f64 {
        TRANSACTION_PROCESSING_TIME
            .with_label_values(&[relayer_id, network_type, stage])
            .get_sample_sum()
    }

    /// Current sample count of one `QUEUE_PICKUP_LATENCY` series.
    fn pickup_samples(queue_type: &str, backend: &str) -> u64 {
        QUEUE_PICKUP_LATENCY
            .with_label_values(&[queue_type, backend])
            .get_sample_count()
    }

    #[test]
    fn test_observe_processing_time_records_to_histogram() {
        let before = processing_samples("test-relayer", "evm", "request_queue_dwell");
        observe_processing_time("test-relayer", "evm", "request_queue_dwell", 1.5);
        let after = processing_samples("test-relayer", "evm", "request_queue_dwell");

        assert_eq!(after, before + 1, "sample count should increase by 1");
    }

    #[test]
    fn test_observe_processing_time_accumulates_sum() {
        let label = "test_sum_stage";
        let before_sum = processing_sum("test-relayer-sum", "stellar", label);

        for secs in [2.0, 3.0].iter() {
            observe_processing_time("test-relayer-sum", "stellar", label, *secs);
        }

        let delta = processing_sum("test-relayer-sum", "stellar", label) - before_sum;
        assert!(
            (delta - 5.0).abs() < 0.001,
            "sum should increase by 5.0, got delta {delta}"
        );
    }

    #[test]
    fn test_stage_constants_are_distinct() {
        let stages = [
            STAGE_REQUEST_QUEUE_DWELL,
            STAGE_PREPARE_DURATION,
            STAGE_SUBMISSION_QUEUE_DWELL,
            STAGE_SUBMIT_DURATION,
        ];
        let distinct = stages
            .iter()
            .copied()
            .collect::<std::collections::HashSet<&str>>()
            .len();
        assert_eq!(stages.len(), distinct, "stage constants must be unique");
    }

    #[test]
    fn test_observe_queue_pickup_latency_records_to_histogram() {
        let before = pickup_samples("notification", "sqs");
        observe_queue_pickup_latency("notification", "sqs", 1.5);
        let after = pickup_samples("notification", "sqs");

        assert_eq!(after, before + 1, "sample count should increase by 1");
    }

    #[test]
    fn test_observe_queue_pickup_latency_both_backends() {
        for backend in ["sqs", "redis"].iter() {
            let before = pickup_samples("relayer-health-check", backend);
            observe_queue_pickup_latency("relayer-health-check", backend, 0.25);
            let after = pickup_samples("relayer-health-check", backend);

            assert_eq!(
                after,
                before + 1,
                "sample count should increase by 1 for backend {backend}"
            );
        }
    }
}
847}