mirror of
https://github.com/cgrates/cgrates.git
synced 2026-02-11 10:06:24 +05:00
prometheus: add cache statistics metrics
This commit is contained in:
committed by
Dan Christian Bogos
parent
69f9120bf3
commit
e70908356e
@@ -27,6 +27,7 @@ import (
|
||||
"github.com/cgrates/cgrates/cores"
|
||||
"github.com/cgrates/cgrates/engine"
|
||||
"github.com/cgrates/cgrates/utils"
|
||||
"github.com/cgrates/ltcache"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/collectors"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
@@ -142,8 +143,10 @@ type PrometheusAgent struct {
|
||||
cfg *config.CGRConfig
|
||||
cm *engine.ConnManager
|
||||
|
||||
handler http.Handler
|
||||
statMetrics *prometheus.GaugeVec
|
||||
handler http.Handler
|
||||
statMetrics *prometheus.GaugeVec
|
||||
cacheGroupsMetric *prometheus.GaugeVec
|
||||
cacheItemsMetric *prometheus.GaugeVec
|
||||
}
|
||||
|
||||
// NewPrometheusAgent creates and initializes a PrometheusAgent with
|
||||
@@ -156,6 +159,23 @@ func NewPrometheusAgent(cfg *config.CGRConfig, cm *engine.ConnManager) *Promethe
|
||||
reg.MustRegister(coreMetricsCollector)
|
||||
}
|
||||
|
||||
cacheGroupsMetric := prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "cgrates",
|
||||
Subsystem: "cache",
|
||||
Name: "groups_total",
|
||||
Help: "Total number of cache groups",
|
||||
}, []string{"cache"})
|
||||
reg.MustRegister(cacheGroupsMetric)
|
||||
|
||||
cacheItemsMetric := prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "cgrates",
|
||||
Subsystem: "cache",
|
||||
Name: "items_total",
|
||||
Help: "Total number of cache items",
|
||||
}, []string{"cache"})
|
||||
reg.MustRegister(cacheItemsMetric)
|
||||
statMetrics := prometheus.NewGaugeVec(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: "cgrates",
|
||||
@@ -177,16 +197,19 @@ func NewPrometheusAgent(cfg *config.CGRConfig, cm *engine.ConnManager) *Promethe
|
||||
)
|
||||
|
||||
return &PrometheusAgent{
|
||||
cfg: cfg,
|
||||
cm: cm,
|
||||
handler: handler,
|
||||
statMetrics: statMetrics,
|
||||
cfg: cfg,
|
||||
cm: cm,
|
||||
handler: handler,
|
||||
statMetrics: statMetrics,
|
||||
cacheGroupsMetric: cacheGroupsMetric,
|
||||
cacheItemsMetric: cacheItemsMetric,
|
||||
}
|
||||
}
|
||||
|
||||
// ServeHTTP implements http.Handler interface. It updates all metrics on each
|
||||
// scrape request before exposing them via the Prometheus HTTP handler.
|
||||
func (pa *PrometheusAgent) ServeHTTP(w http.ResponseWriter, r *http.Request) {
|
||||
pa.updateCacheStats()
|
||||
pa.updateStatsMetrics()
|
||||
pa.handler.ServeHTTP(w, r)
|
||||
}
|
||||
@@ -225,6 +248,35 @@ func (pa *PrometheusAgent) updateStatsMetrics() {
|
||||
}
|
||||
}
|
||||
|
||||
// updateCacheStats fetches cache statistics from configured CacheS connections
|
||||
// and updates the corresponding Prometheus metrics.
|
||||
func (pa *PrometheusAgent) updateCacheStats() {
|
||||
if len(pa.cfg.PrometheusAgentCfg().CacheSConns) == 0 {
|
||||
return
|
||||
}
|
||||
for _, connID := range pa.cfg.PrometheusAgentCfg().CacheSConns {
|
||||
var cacheStats map[string]*ltcache.CacheStats
|
||||
if err := pa.cm.Call(context.Background(), []string{connID},
|
||||
utils.CacheSv1GetCacheStats,
|
||||
&utils.AttrCacheIDsWithAPIOpts{
|
||||
CacheIDs: pa.cfg.PrometheusAgentCfg().CacheIDs,
|
||||
}, &cacheStats); err != nil {
|
||||
utils.Logger.Err(fmt.Sprintf(
|
||||
"<%s> failed to retrieve cache stats (connID=%q): %v",
|
||||
utils.PrometheusAgent, connID, err))
|
||||
continue
|
||||
}
|
||||
|
||||
for cacheID, stats := range cacheStats {
|
||||
if stats == nil {
|
||||
continue
|
||||
}
|
||||
pa.cacheGroupsMetric.WithLabelValues(cacheID).Set(float64(stats.Groups))
|
||||
pa.cacheItemsMetric.WithLabelValues(cacheID).Set(float64(stats.Items))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// coreMetricsCollector collects CoreS metrics. Equivalent to Go/Process collectors.
|
||||
type coreMetricsCollector struct {
|
||||
cfg *config.CGRConfig
|
||||
|
||||
@@ -840,11 +840,11 @@ const CGRATES_CFG_JSON = `
|
||||
"prometheus_agent": {
|
||||
"enabled": false, // enables the prometheus agent: <true|false>
|
||||
"path": "/prometheus", // endpoint for prometheus metrics
|
||||
"collect_go_metrics": false, // include Go runtime metrics (memory, GC, goroutines)
|
||||
"collect_process_metrics": false, // include process metrics (CPU, file descriptors)
|
||||
// "cores_conns": [], // connections to CoreS, empty to disable: <""|*internal|$rpc_conns_id>
|
||||
// "stats_conns": [], // connections to StatS, empty to disable: <""|*internal|$rpc_conns_id>
|
||||
// "stat_queue_ids": [] // StatQueue IDs to collect metrics from <[tenant]:ID>
|
||||
"caches_conns": [], // connections to CacheS, empty to disable: <""|*internal|$rpc_conns_id>
|
||||
"cache_ids": [], // cache partition IDs to collect statistics for, empty for all partitions
|
||||
"cores_conns": [], // connections to CoreS, empty to disable: <""|*internal|$rpc_conns_id>
|
||||
"stats_conns": [], // connections to StatS, empty to disable: <""|*internal|$rpc_conns_id>
|
||||
"stat_queue_ids": [] // StatQueue IDs to collect metrics from <[tenant]:ID>
|
||||
},
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -30,6 +30,8 @@ type PrometheusAgentJsonCfg struct {
|
||||
Path *string `json:"path"`
|
||||
CollectGoMetrics *bool `json:"collect_go_metrics"`
|
||||
CollectProcessMetrics *bool `json:"collect_process_metrics"`
|
||||
CacheSConns *[]string `json:"caches_conns"`
|
||||
CacheIDs *[]string `json:"cache_ids"`
|
||||
CoreSConns *[]string `json:"cores_conns"`
|
||||
StatSConns *[]string `json:"stats_conns"`
|
||||
StatQueueIDs *[]string `json:"stat_queue_ids"`
|
||||
@@ -41,6 +43,8 @@ type PrometheusAgentCfg struct {
|
||||
Path string
|
||||
CollectGoMetrics bool
|
||||
CollectProcessMetrics bool
|
||||
CacheSConns []string
|
||||
CacheIDs []string
|
||||
CoreSConns []string
|
||||
StatSConns []string
|
||||
StatQueueIDs []string
|
||||
@@ -62,6 +66,12 @@ func (c *PrometheusAgentCfg) loadFromJSONCfg(jc *PrometheusAgentJsonCfg) error {
|
||||
if jc.CollectProcessMetrics != nil {
|
||||
c.CollectProcessMetrics = *jc.CollectProcessMetrics
|
||||
}
|
||||
if jc.CacheSConns != nil {
|
||||
c.CacheSConns = tagInternalConns(*jc.CacheSConns, utils.MetaCaches)
|
||||
}
|
||||
if jc.CacheIDs != nil {
|
||||
c.CacheIDs = *jc.CacheIDs
|
||||
}
|
||||
if jc.CoreSConns != nil {
|
||||
c.CoreSConns = tagInternalConns(*jc.CoreSConns, utils.MetaCore)
|
||||
}
|
||||
@@ -81,6 +91,8 @@ func (c PrometheusAgentCfg) AsMapInterface() any {
|
||||
utils.PathCfg: c.Path,
|
||||
utils.CollectGoMetricsCfg: c.CollectGoMetrics,
|
||||
utils.CollectProcessMetricsCfg: c.CollectProcessMetrics,
|
||||
utils.CacheSConnsCfg: stripInternalConns(c.CacheSConns),
|
||||
utils.CacheIDsCfg: stripInternalConns(c.CacheIDs),
|
||||
utils.CoreSConnsCfg: stripInternalConns(c.CoreSConns),
|
||||
utils.StatSConnsCfg: stripInternalConns(c.StatSConns),
|
||||
utils.StatQueueIDsCfg: c.StatQueueIDs,
|
||||
@@ -94,6 +106,8 @@ func (c PrometheusAgentCfg) Clone() *PrometheusAgentCfg {
|
||||
Path: c.Path,
|
||||
CollectGoMetrics: c.CollectGoMetrics,
|
||||
CollectProcessMetrics: c.CollectProcessMetrics,
|
||||
CacheSConns: slices.Clone(c.CacheSConns),
|
||||
CacheIDs: slices.Clone(c.CacheIDs),
|
||||
CoreSConns: slices.Clone(c.CoreSConns),
|
||||
StatSConns: slices.Clone(c.StatSConns),
|
||||
StatQueueIDs: slices.Clone(c.StatQueueIDs),
|
||||
|
||||
@@ -7,8 +7,9 @@ PrometheusAgent
|
||||
|
||||
1. **Core metrics** - collected from configured CGRateS engines via CoreSv1.Status API
|
||||
2. **StatQueue metrics** - values from CGRateS :ref:`StatS <stats>` component, collected via StatSv1.GetQueueFloatMetrics API
|
||||
3. **Cache statistics** - collected from configured :ref:`CacheS <caches>` components via CacheSv1.GetCacheStats API
|
||||
|
||||
For core metrics, the agent computes real-time values on each Prometheus scrape request. For StatQueue metrics, it retrieves the current state of the stored StatQueues without additional calculations.
|
||||
For core metrics, the agent computes real-time values on each Prometheus scrape request. For StatQueue metrics, it retrieves the current state of the stored StatQueues without additional calculations. For cache statistics, it collects current cache utilization data from the configured cache partitions.
|
||||
|
||||
Configuration
|
||||
-------------
|
||||
@@ -20,6 +21,12 @@ Example configuration in the JSON file:
|
||||
"prometheus_agent": {
|
||||
"enabled": true,
|
||||
"path": "/prometheus",
|
||||
"caches_conns": ["*internal"],
|
||||
"cache_ids": [
|
||||
"*attribute_filter_indexes",
|
||||
"*charger_profiles",
|
||||
"*rpc_connections"
|
||||
],
|
||||
"cores_conns": ["*internal", "external"],
|
||||
"stats_conns": ["*internal", "external"],
|
||||
"stat_queue_ids": ["cgrates.org:SQ_1", "SQ_2"]
|
||||
@@ -36,6 +43,12 @@ enabled
|
||||
path
|
||||
HTTP endpoint path where Prometheus metrics will be exposed, e.g., "/prometheus" or "/metrics"
|
||||
|
||||
caches_conns
|
||||
List of connection IDs to CacheS components for collecting cache statistics. Empty list disables cache metrics collection. Possible values: <""|*internal|$rpc_conns_id>
|
||||
|
||||
cache_ids
|
||||
List of cache partition IDs to collect statistics for. Available cache IDs can be found in the caches.partitions section of the default configuration. Empty list collects statistics for all available cache partitions.
|
||||
|
||||
cores_conns
|
||||
List of connection IDs to CoreS components for collecting core metrics. Empty list disables core metrics collection. Possible values: <""|*internal|$rpc_conns_id>
|
||||
|
||||
@@ -89,6 +102,28 @@ The PrometheusAgent exposes the following metrics:
|
||||
# TYPE go_memstats_alloc_bytes gauge
|
||||
go_memstats_alloc_bytes{node_id="e94160b"} 1.1360808e+07
|
||||
|
||||
3. **Cache Metrics** (when caches_conns is configured)
|
||||
- Two separate gauge metrics for cache statistics, ``cgrates_cache_groups_total`` and ``cgrates_cache_items_total``, each carrying a ``cache`` label set to the cache partition ID
|
||||
- Obtained from CacheS services on each scrape request
|
||||
- Useful for identifying memory usage patterns and potential performance issues
|
||||
|
||||
Example of cache metrics output:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
# HELP cgrates_cache_groups_total Total number of cache groups
|
||||
# TYPE cgrates_cache_groups_total gauge
|
||||
cgrates_cache_groups_total{cache="*attribute_filter_indexes"} 2
|
||||
cgrates_cache_groups_total{cache="*charger_profiles"} 0
|
||||
cgrates_cache_groups_total{cache="*rpc_connections"} 0
|
||||
|
||||
# HELP cgrates_cache_items_total Total number of cache items
|
||||
# TYPE cgrates_cache_items_total gauge
|
||||
cgrates_cache_items_total{cache="*attribute_filter_indexes"} 6
|
||||
cgrates_cache_items_total{cache="*charger_profiles"} 2
|
||||
cgrates_cache_items_total{cache="*rpc_connections"} 1
|
||||
|
||||
|
||||
How It Works
|
||||
------------
|
||||
|
||||
@@ -96,6 +131,7 @@ The PrometheusAgent operates differently than other CGRateS components that use
|
||||
|
||||
- When multiple connections are configured in stats_conns, the agent collects metrics from **all** connections, not just the first available one
|
||||
- When multiple connections are configured in cores_conns, the agent attempts to collect metrics from **all** connections, labeling them with their respective node_id
|
||||
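- When multiple connections are configured in caches_conns, the agent collects cache statistics from **all** connections for the specified cache_ids (cache metrics are labeled only by cache partition ID, so if several connections report the same partition, the last value collected is the one exposed)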
|
||||
- The agent processes metrics requests only when Prometheus sends a scrape request to the configured HTTP endpoint
|
||||
|
||||
You can view all exported metrics and see what Prometheus would scrape by making a simple curl request to the HTTP endpoint:
|
||||
|
||||
@@ -2596,6 +2596,7 @@ const (
|
||||
CoreSConnsCfg = "cores_conns"
|
||||
CollectGoMetricsCfg = "collect_go_metrics"
|
||||
CollectProcessMetricsCfg = "collect_process_metrics"
|
||||
CacheIDsCfg = "cache_ids"
|
||||
StatQueueIDsCfg = "stat_queue_ids"
|
||||
|
||||
// AttributeSCfg
|
||||
|
||||
Reference in New Issue
Block a user