prometheus: add cache statistics metrics

This commit is contained in:
ionutboangiu
2025-07-15 21:33:14 +03:00
committed by Dan Christian Bogos
parent 7781961386
commit 71a1242522
8 changed files with 243 additions and 16 deletions

View File

@@ -27,6 +27,7 @@ import (
"github.com/cgrates/cgrates/cores"
"github.com/cgrates/cgrates/engine"
"github.com/cgrates/cgrates/utils"
"github.com/cgrates/ltcache"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -84,8 +85,10 @@ type PrometheusAgent struct {
cfg *config.CGRConfig
cm *engine.ConnManager
handler http.Handler
statMetrics *prometheus.GaugeVec
handler http.Handler
statMetrics *prometheus.GaugeVec
cacheGroupsMetric *prometheus.GaugeVec
cacheItemsMetric *prometheus.GaugeVec
}
// NewPrometheusAgent creates and initializes a PrometheusAgent with
@@ -98,6 +101,23 @@ func NewPrometheusAgent(cfg *config.CGRConfig, cm *engine.ConnManager) *Promethe
reg.MustRegister(coreMetricsCollector)
}
cacheGroupsMetric := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "cgrates",
Subsystem: "cache",
Name: "groups_total",
Help: "Total number of cache groups",
}, []string{"cache"})
reg.MustRegister(cacheGroupsMetric)
cacheItemsMetric := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "cgrates",
Subsystem: "cache",
Name: "items_total",
Help: "Total number of cache items",
}, []string{"cache"})
reg.MustRegister(cacheItemsMetric)
statMetrics := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "cgrates",
@@ -119,16 +139,19 @@ func NewPrometheusAgent(cfg *config.CGRConfig, cm *engine.ConnManager) *Promethe
)
return &PrometheusAgent{
cfg: cfg,
cm: cm,
handler: handler,
statMetrics: statMetrics,
cfg: cfg,
cm: cm,
handler: handler,
statMetrics: statMetrics,
cacheGroupsMetric: cacheGroupsMetric,
cacheItemsMetric: cacheItemsMetric,
}
}
// ServeHTTP implements the http.Handler interface. On every scrape request it
// first refreshes the cache statistics gauges (updateCacheStats) and the stats
// metrics (updateStatsMetrics), and only then delegates to the wrapped
// Prometheus HTTP handler, which writes the response.
func (pa *PrometheusAgent) ServeHTTP(w http.ResponseWriter, r *http.Request) {
// Refresh before serving: the update calls run synchronously, ahead of
// the handler that produces the response for this request.
pa.updateCacheStats()
pa.updateStatsMetrics()
pa.handler.ServeHTTP(w, r)
}
@@ -167,6 +190,35 @@ func (pa *PrometheusAgent) updateStatsMetrics() {
}
}
// updateCacheStats refreshes the cache gauges from CacheS.
//
// Each connection ID listed in the agent's CacheSConns is queried on its own
// through CacheSv1.GetCacheStats, restricted to the configured CacheIDs. A
// failed call is logged and that connection is skipped; the remaining
// connections are still queried. With no connections configured the function
// does nothing.
//
// The gauges are addressed only by cache ID, so when several connections
// report the same cache ID, the value set for the connection processed last
// is the one that remains.
func (pa *PrometheusAgent) updateCacheStats() {
	agentCfg := pa.cfg.PrometheusAgentCfg()
	// Ranging over an empty connection list is a no-op, so no explicit
	// emptiness guard is needed.
	for _, connID := range agentCfg.CacheSConns {
		args := &utils.AttrCacheIDsWithAPIOpts{
			CacheIDs: agentCfg.CacheIDs,
		}
		var reply map[string]*ltcache.CacheStats
		err := pa.cm.Call(context.Background(), []string{connID},
			utils.CacheSv1GetCacheStats, args, &reply)
		if err != nil {
			utils.Logger.Err(fmt.Sprintf(
				"<%s> failed to retrieve cache stats (connID=%q): %v",
				utils.PrometheusAgent, connID, err))
			continue
		}
		for partitionID, partitionStats := range reply {
			// Entries without stats are skipped to avoid a nil dereference.
			if partitionStats != nil {
				pa.cacheGroupsMetric.WithLabelValues(partitionID).Set(float64(partitionStats.Groups))
				pa.cacheItemsMetric.WithLabelValues(partitionID).Set(float64(partitionStats.Items))
			}
		}
	}
}
// coreMetricsCollector collects CoreS metrics. Equivalent to Go/Process collectors.
type coreMetricsCollector struct {
cfg *config.CGRConfig

View File

@@ -1096,9 +1096,9 @@ const CGRATES_CFG_JSON = `
"prometheus_agent": {
"enabled": false, // enables the prometheus agent: <true|false>
"path": "/prometheus", // endpoint for prometheus metrics
"collect_go_metrics": false, // include Go runtime metrics (memory, GC, goroutines)
"collect_process_metrics": false, // include process metrics (CPU, file descriptors)
"cores_conns": [], // connections to CoreS, empty to disable: <""|*internal|$rpc_conns_id>
"caches_conns": [], // connections to CacheS, empty to disable: <""|*internal|$rpc_conns_id>
"cache_ids": [], // cache partition IDs to collect statistics for, empty for all partitions
"cores_conns": [], // connections to CoreS, empty to disable: <""|*internal|$rpc_conns_id>
"stats_conns": [], // connections to StatS, empty to disable: <""|*internal|$rpc_conns_id>
"stat_queue_ids": [] // StatQueue IDs to collect metrics from <[tenant]:ID>
},

File diff suppressed because one or more lines are too long

View File

@@ -33,6 +33,8 @@ type PrometheusAgentJsonCfg struct {
Path *string `json:"path"`
CollectGoMetrics *bool `json:"collect_go_metrics"`
CollectProcessMetrics *bool `json:"collect_process_metrics"`
CacheSConns *[]string `json:"caches_conns"`
CacheIDs *[]string `json:"cache_ids"`
CoreSConns *[]string `json:"cores_conns"`
StatSConns *[]string `json:"stats_conns"`
StatQueueIDs *[]string `json:"stat_queue_ids"`
@@ -44,6 +46,8 @@ type PrometheusAgentCfg struct {
Path string
CollectGoMetrics bool
CollectProcessMetrics bool
CacheSConns []string
CacheIDs []string
CoreSConns []string
StatSConns []string
StatQueueIDs []string
@@ -74,6 +78,12 @@ func (c *PrometheusAgentCfg) loadFromJSONCfg(jc *PrometheusAgentJsonCfg) error {
if jc.CollectProcessMetrics != nil {
c.CollectProcessMetrics = *jc.CollectProcessMetrics
}
if jc.CacheSConns != nil {
c.CacheSConns = tagInternalConns(*jc.CacheSConns, utils.MetaCaches)
}
if jc.CacheIDs != nil {
c.CacheIDs = *jc.CacheIDs
}
if jc.CoreSConns != nil {
c.CoreSConns = tagInternalConns(*jc.CoreSConns, utils.MetaCore)
}
@@ -93,6 +103,8 @@ func (c PrometheusAgentCfg) AsMapInterface() any {
utils.PathCfg: c.Path,
utils.CollectGoMetricsCfg: c.CollectGoMetrics,
utils.CollectProcessMetricsCfg: c.CollectProcessMetrics,
utils.CacheSConnsCfg: stripInternalConns(c.CacheSConns),
utils.CacheIDsCfg: stripInternalConns(c.CacheIDs),
utils.CoreSConnsCfg: stripInternalConns(c.CoreSConns),
utils.StatSConnsCfg: stripInternalConns(c.StatSConns),
utils.StatQueueIDsCfg: c.StatQueueIDs,
@@ -109,6 +121,8 @@ func (c PrometheusAgentCfg) Clone() *PrometheusAgentCfg {
Path: c.Path,
CollectGoMetrics: c.CollectGoMetrics,
CollectProcessMetrics: c.CollectProcessMetrics,
CacheSConns: slices.Clone(c.CacheSConns),
CacheIDs: slices.Clone(c.CacheIDs),
CoreSConns: slices.Clone(c.CoreSConns),
StatSConns: slices.Clone(c.StatSConns),
StatQueueIDs: slices.Clone(c.StatQueueIDs),

18
docs/agents.rst Normal file
View File

@@ -0,0 +1,18 @@
.. _agents:
Agents
======
CGRateS agents are interfaces towards external systems, implementing the protocols required by the communication channels they open. They are designed to be flexible and configurable for both requests and replies.
Agents act as protocol translators and adapters, making CGRateS accessible to various external applications and monitoring systems. Most agents communicate primarily with SessionS, which coordinates with other core components.
Available Agents
----------------
.. toctree::
:maxdepth: 2
prometheus
diameter
radius

View File

@@ -15,4 +15,5 @@ Welcome to `CGRateS`_'s documentation!
configuration
core_concepts
components
agents
troubleshooting

141
docs/prometheus.rst Normal file
View File

@@ -0,0 +1,141 @@
.. _prometheus_agent:
PrometheusAgent
===============
**PrometheusAgent** is a CGRateS component that exposes metrics for Prometheus monitoring systems. It serves as a bridge between CGRateS and Prometheus by collecting and exposing metrics from:
1. **Core metrics** - collected from configured CGRateS engines via CoreSv1.Status API
2. **StatQueue metrics** - values from CGRateS :ref:`StatS <stats>` component, collected via StatSv1.GetQueueFloatMetrics API
3. **Cache statistics** - collected from configured :ref:`CacheS <caches>` components via CacheSv1.GetCacheStats API
For core metrics, the agent computes real-time values on each Prometheus scrape request. For StatQueue metrics, it retrieves the current state of the stored StatQueues without additional calculations. For cache statistics, it collects current cache utilization data from the configured cache partitions.
Configuration
-------------
Example configuration in the JSON file:
.. code-block:: json
"prometheus_agent": {
"enabled": true,
"path": "/prometheus",
"caches_conns": ["*internal"],
"cache_ids": [
"*attribute_filter_indexes",
"*charger_profiles",
"*rpc_connections"
],
"cores_conns": ["*internal", "external"],
"stats_conns": ["*internal", "external"],
"stat_queue_ids": ["cgrates.org:SQ_1", "SQ_2"]
}
The default configuration can be found in the :ref:`configuration` section.
Parameters
----------
enabled
Enable the PrometheusAgent module. Possible values: <true|false>
path
HTTP endpoint path where Prometheus metrics will be exposed, e.g., "/prometheus" or "/metrics"
caches_conns
List of connection IDs to CacheS components for collecting cache statistics. Empty list disables cache metrics collection. Possible values: <""|*internal|$rpc_conns_id>
cache_ids
List of cache partition IDs to collect statistics for. Available cache IDs can be found in the caches.partitions section of the default configuration. Empty list collects statistics for all available cache partitions.
cores_conns
List of connection IDs to CoreS components for collecting core metrics. Empty list disables core metrics collection. Possible values: <""|*internal|$rpc_conns_id>
stats_conns
List of connection IDs to StatS components for collecting StatQueue metrics. Empty list disables StatQueue metrics collection. Possible values: <""|*internal|$rpc_conns_id>
stat_queue_ids
List of StatQueue IDs to collect metrics from. IDs may include the tenant, in the format <[tenant]:ID>. If the tenant is not specified, the default tenant from the general configuration is used.
Available Metrics
-----------------
The PrometheusAgent exposes the following metrics:
1. **StatQueue Metrics**
- Uses the naming format ``cgrates_stats_metrics`` with labels for tenant, queue, and metric type
- Obtained from StatS services on each scrape request
Example of StatQueue metrics output:
.. code-block:: none
# HELP cgrates_stats_metrics Current values for StatQueue metrics
# TYPE cgrates_stats_metrics gauge
cgrates_stats_metrics{metric="*acc",queue="SQ_1",tenant="cgrates.org"} 7.73779
cgrates_stats_metrics{metric="*tcc",queue="SQ_1",tenant="cgrates.org"} 23.21337
cgrates_stats_metrics{metric="*acc",queue="SQ_2",tenant="cgrates.org"} 11.34716
cgrates_stats_metrics{metric="*tcc",queue="SQ_2",tenant="cgrates.org"} 34.04147
.. note::
StatQueue metrics don't include node_id labels since StatQueues can be shared between CGRateS instances. Users should ensure StatQueue IDs are unique across their environment.
2. **Core Metrics** (when cores_conns is configured)
- Standard Go runtime metrics (go_goroutines, go_memstats_*, etc.)
- Standard process metrics (process_cpu_seconds_total, process_open_fds, etc.)
- Node identification via "node_id" label, allowing multiple CGRateS engines to be monitored
Example of core metrics output:
.. code-block:: none
# HELP go_goroutines Number of goroutines that currently exist.
# TYPE go_goroutines gauge
go_goroutines{node_id="e94160b"} 40
# HELP process_cpu_seconds_total Total user and system CPU time spent in seconds.
# TYPE process_cpu_seconds_total counter
process_cpu_seconds_total{node_id="e94160b"} 0.34
# HELP go_memstats_alloc_bytes Number of bytes allocated in heap and currently in use.
# TYPE go_memstats_alloc_bytes gauge
go_memstats_alloc_bytes{node_id="e94160b"} 1.1360808e+07
3. **Cache Metrics** (when caches_conns is configured)
- Two separate metrics for cache statistics: ``cgrates_cache_groups_total`` and ``cgrates_cache_items_total``, each carrying a ``cache`` label that holds the cache partition ID
- Obtained from CacheS services on each scrape request
- Useful for identifying memory usage patterns and potential performance issues
Example of cache metrics output:
.. code-block:: none
# HELP cgrates_cache_groups_total Total number of cache groups
# TYPE cgrates_cache_groups_total gauge
cgrates_cache_groups_total{cache="*attribute_filter_indexes"} 2
cgrates_cache_groups_total{cache="*charger_profiles"} 0
cgrates_cache_groups_total{cache="*rpc_connections"} 0
# HELP cgrates_cache_items_total Total number of cache items
# TYPE cgrates_cache_items_total gauge
cgrates_cache_items_total{cache="*attribute_filter_indexes"} 6
cgrates_cache_items_total{cache="*charger_profiles"} 2
cgrates_cache_items_total{cache="*rpc_connections"} 1
How It Works
------------
The PrometheusAgent operates differently from other CGRateS components that use connection failover:
- When multiple connections are configured in stats_conns, the agent collects metrics from **all** connections, not just the first available one
- When multiple connections are configured in cores_conns, the agent attempts to collect metrics from **all** connections, labeling them with their respective node_id
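- When multiple connections are configured in caches_conns, the agent collects cache statistics from **all** connections for the specified cache_ids. Cache metrics are labeled only by cache partition ID, so if several connections report the same partition, the value from the connection queried last is the one exposed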
- The agent processes metrics requests only when Prometheus sends a scrape request to the configured HTTP endpoint
You can view all exported metrics and see what Prometheus would scrape by making a simple curl request to the HTTP endpoint:
.. code-block:: bash
curl http://localhost:2080/prometheus

View File

@@ -2267,12 +2267,6 @@ const (
TemplatesCfg = "templates"
RequestProcessorsCfg = "request_processors"
// PrometheusAgentCfg
CoreSConnsCfg = "cores_conns"
CollectGoMetricsCfg = "collect_go_metrics"
CollectProcessMetricsCfg = "collect_process_metrics"
StatQueueIDsCfg = "stat_queue_ids"
// RequestProcessor
RequestFieldsCfg = "request_fields"
ReplyFieldsCfg = "reply_fields"
@@ -2294,6 +2288,13 @@ const (
AdminAddressCfg = "admin_address"
AdminPasswordCfg = "admin_password"
// PrometheusAgentCfg
CoreSConnsCfg = "cores_conns"
CollectGoMetricsCfg = "collect_go_metrics"
CollectProcessMetricsCfg = "collect_process_metrics"
CacheIDsCfg = "cache_ids"
StatQueueIDsCfg = "stat_queue_ids"
// AttributeSCfg
IndexedSelectsCfg = "indexed_selects"
ProfileRunsCfg = "profile_runs"