prometheus: fetch all StatQueues when IDs list is empty

This commit is contained in:
ionutboangiu
2025-11-12 18:10:52 +02:00
committed by Dan Christian Bogos
parent d5f49ba1a2
commit e019aeabab
6 changed files with 82 additions and 45 deletions

View File

@@ -216,11 +216,23 @@ func (pa *PrometheusAgent) ServeHTTP(w http.ResponseWriter, r *http.Request) {
// updateStatsMetrics fetches and updates all StatQueue metrics by calling each
// configured StatS connection.
func (pa *PrometheusAgent) updateStatsMetrics() {
if len(pa.cfg.PrometheusAgentCfg().StatQueueIDs) == 0 {
return
}
for _, connID := range pa.cfg.PrometheusAgentCfg().StatSConns {
for _, sqID := range pa.cfg.PrometheusAgentCfg().StatQueueIDs {
for connIdx, connID := range pa.cfg.PrometheusAgentCfg().StatSConns {
sqIDs := pa.cfg.PrometheusAgentCfg().StatQueueIDs
// When no StatQueueIDs set, fetch all available ones.
if len(sqIDs) == 0 {
adminsConnID := pa.cfg.PrometheusAgentCfg().AdminSConns[connIdx]
if err := pa.cm.Call(context.Background(), []string{adminsConnID},
utils.AdminSv1GetStatQueueProfileIDs,
&utils.ArgsItemIDs{}, &sqIDs); err != nil {
utils.Logger.Err(fmt.Sprintf(
"<%s> failed to retrieve all StatQueue IDs (connID=%q): %v",
utils.PrometheusAgent, adminsConnID, err))
continue
}
}
for _, sqID := range sqIDs {
tenantID := utils.NewTenantID(sqID)
if tenantID.Tenant == "" {

View File

@@ -1089,6 +1089,7 @@ const CGRATES_CFG_JSON = `
"prometheus_agent": {
"enabled": false, // enables the prometheus agent: <true|false>
"path": "/prometheus", // endpoint for prometheus metrics
"admins_conns": [], // connections to AdminS, empty to disable: <""|*internal|$rpc_conns_id>
"caches_conns": [], // connections to CacheS, empty to disable: <""|*internal|$rpc_conns_id>
"cache_ids": [], // cache partition IDs to collect statistics for, empty for all partitions
"cores_conns": [], // connections to CoreS, empty to disable: <""|*internal|$rpc_conns_id>

View File

@@ -1253,8 +1253,46 @@ func (cfg *CGRConfig) checkConfigSanity() error {
return fmt.Errorf("<%s> the CleanupInterval needs to be bigger than 0", utils.AnalyzerS)
}
}
if err := cfg.prometheusAgentCfg.validate(cfg); err != nil {
return err
if cfg.prometheusAgentCfg.Enabled {
if len(cfg.prometheusAgentCfg.StatSConns) > 0 &&
len(cfg.prometheusAgentCfg.StatQueueIDs) == 0 &&
len(cfg.prometheusAgentCfg.StatSConns) != len(cfg.prometheusAgentCfg.AdminSConns) {
return fmt.Errorf(
"<%s> when StatQueueIDs is empty, admins_conns must match stats_conns length to fetch StatQueue IDs",
utils.PrometheusAgent)
}
for _, connID := range cfg.prometheusAgentCfg.AdminSConns {
if strings.HasPrefix(connID, utils.MetaInternal) && !cfg.admS.Enabled {
return fmt.Errorf("<%s> not enabled but requested by <%s> component", utils.AdminS, utils.PrometheusAgent)
}
if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) {
return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID)
}
}
for _, connID := range cfg.prometheusAgentCfg.CacheSConns {
if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) {
return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID)
}
}
for _, connID := range cfg.prometheusAgentCfg.CoreSConns {
if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) {
return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID)
}
}
for _, connID := range cfg.prometheusAgentCfg.StatSConns {
if strings.HasPrefix(connID, utils.MetaInternal) && !cfg.statsCfg.Enabled {
return fmt.Errorf("<%s> not enabled but requested by <%s> component", utils.StatService, utils.PrometheusAgent)
}
if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) {
return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID)
}
}
if len(cfg.prometheusAgentCfg.CoreSConns) > 0 {
if cfg.prometheusAgentCfg.CollectGoMetrics || cfg.prometheusAgentCfg.CollectProcessMetrics {
return fmt.Errorf("<%s> collect_go_metrics and collect_process_metrics cannot be enabled when using CoreSConns",
utils.PrometheusAgent)
}
}
}
return nil

View File

@@ -19,9 +19,7 @@ along with this program. If not, see <https://www.gnu.org/licenses/>
package config
import (
"fmt"
"slices"
"strings"
"github.com/cgrates/birpc/context"
"github.com/cgrates/cgrates/utils"
@@ -33,6 +31,7 @@ type PrometheusAgentJsonCfg struct {
Path *string `json:"path"`
CollectGoMetrics *bool `json:"collect_go_metrics"`
CollectProcessMetrics *bool `json:"collect_process_metrics"`
AdminSConns *[]string `json:"admins_conns"`
CacheSConns *[]string `json:"caches_conns"`
CacheIDs *[]string `json:"cache_ids"`
CoreSConns *[]string `json:"cores_conns"`
@@ -46,6 +45,7 @@ type PrometheusAgentCfg struct {
Path string
CollectGoMetrics bool
CollectProcessMetrics bool
AdminSConns []string
CacheSConns []string
CacheIDs []string
CoreSConns []string
@@ -78,6 +78,9 @@ func (c *PrometheusAgentCfg) loadFromJSONCfg(jc *PrometheusAgentJsonCfg) error {
if jc.CollectProcessMetrics != nil {
c.CollectProcessMetrics = *jc.CollectProcessMetrics
}
if jc.AdminSConns != nil {
c.AdminSConns = tagInternalConns(*jc.AdminSConns, utils.MetaAdminS)
}
if jc.CacheSConns != nil {
c.CacheSConns = tagInternalConns(*jc.CacheSConns, utils.MetaCaches)
}
@@ -103,6 +106,7 @@ func (c PrometheusAgentCfg) AsMapInterface() any {
utils.PathCfg: c.Path,
utils.CollectGoMetricsCfg: c.CollectGoMetrics,
utils.CollectProcessMetricsCfg: c.CollectProcessMetrics,
utils.AdminSConnsCfg: stripInternalConns(c.AdminSConns),
utils.CacheSConnsCfg: stripInternalConns(c.CacheSConns),
utils.CacheIDsCfg: stripInternalConns(c.CacheIDs),
utils.CoreSConnsCfg: stripInternalConns(c.CoreSConns),
@@ -121,6 +125,7 @@ func (c PrometheusAgentCfg) Clone() *PrometheusAgentCfg {
Path: c.Path,
CollectGoMetrics: c.CollectGoMetrics,
CollectProcessMetrics: c.CollectProcessMetrics,
AdminSConns: slices.Clone(c.AdminSConns),
CacheSConns: slices.Clone(c.CacheSConns),
CacheIDs: slices.Clone(c.CacheIDs),
CoreSConns: slices.Clone(c.CoreSConns),
@@ -129,37 +134,6 @@ func (c PrometheusAgentCfg) Clone() *PrometheusAgentCfg {
}
}
// validate verifies the PrometheusAgent section against the rest of the
// configuration held in cfg. A disabled agent is never checked and yields nil.
//
// For an enabled agent it reports an error when:
//   - a CacheS, CoreS or StatS connection ID is neither prefixed with
//     utils.MetaInternal nor present in cfg.rpcConns;
//   - a StatS connection is internal while cfg.statsCfg.Enabled is false;
//   - CoreS connections are configured together with CollectGoMetrics or
//     CollectProcessMetrics.
func (c PrometheusAgentCfg) validate(cfg *CGRConfig) error {
	if !c.Enabled {
		return nil
	}

	// isInternal reports whether id points to an in-process connection.
	isInternal := func(id string) bool {
		return strings.HasPrefix(id, utils.MetaInternal)
	}
	// undefined reports whether id is a non-internal connection that is
	// missing from the configured RPC connections.
	undefined := func(id string) bool {
		if isInternal(id) {
			return false
		}
		_, found := cfg.rpcConns[id]
		return !found
	}

	// CacheS and CoreS connections only need to exist; they are read from the
	// shared config, checked in that order.
	for _, conns := range [][]string{
		cfg.prometheusAgentCfg.CacheSConns,
		cfg.prometheusAgentCfg.CoreSConns,
	} {
		for _, id := range conns {
			if undefined(id) {
				return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, id)
			}
		}
	}

	// StatS connections additionally require the local StatS service to be
	// enabled when they are internal.
	for _, id := range c.StatSConns {
		if isInternal(id) && !cfg.statsCfg.Enabled {
			return fmt.Errorf("<%s> not enabled but requested by <%s> component", utils.StatService, utils.PrometheusAgent)
		}
		if undefined(id) {
			return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, id)
		}
	}

	// Go/process collectors are rejected whenever CoreS connections are set.
	if len(c.CoreSConns) > 0 && (c.CollectGoMetrics || c.CollectProcessMetrics) {
		return fmt.Errorf("<%s> collect_go_metrics and collect_process_metrics cannot be enabled when using CoreSConns",
			utils.PrometheusAgent)
	}
	return nil
}
func diffPrometheusAgentJsonCfg(d *PrometheusAgentJsonCfg, v1, v2 *PrometheusAgentCfg) *PrometheusAgentJsonCfg {
if d == nil {
d = new(PrometheusAgentJsonCfg)
@@ -177,6 +151,9 @@ func diffPrometheusAgentJsonCfg(d *PrometheusAgentJsonCfg, v1, v2 *PrometheusAge
if v1.CollectProcessMetrics != v2.CollectProcessMetrics && true {
d.CollectProcessMetrics = utils.BoolPointer(v2.CollectProcessMetrics)
}
if !slices.Equal(v1.AdminSConns, v2.AdminSConns) {
d.AdminSConns = utils.SliceStringPointer(v2.AdminSConns)
}
if !slices.Equal(v1.CoreSConns, v2.CoreSConns) {
d.CoreSConns = utils.SliceStringPointer(v2.CoreSConns)
}

View File

@@ -21,6 +21,7 @@ Example configuration in the JSON file:
"prometheus_agent": {
"enabled": true,
"path": "/prometheus",
"apiers_conns": ["*internal", "external"],
"caches_conns": ["*internal"],
"cache_ids": [
"*attribute_filter_indexes",
@@ -43,6 +44,9 @@ enabled
path
HTTP endpoint path where Prometheus metrics will be exposed, e.g., "/prometheus" or "/metrics"
admins_conns
List of connection IDs to AdminS components. Required when stat_queue_ids is empty to fetch all available StatQueue profile IDs. Must match the length of stats_conns when auto-fetching is used. Possible values: <""|*internal|$rpc_conns_id>
caches_conns
List of connection IDs to CacheS components for collecting cache statistics. Empty list disables cache metrics collection. Possible values: <""|*internal|$rpc_conns_id>
@@ -56,7 +60,7 @@ stats_conns
List of connection IDs to StatS components for collecting StatQueue metrics. Empty list disables StatQueue metrics collection. Possible values: <""|*internal|$rpc_conns_id>
stat_queue_ids
List of StatQueue IDs to collect metrics from. Can include tenant in format <[tenant]:ID>. If tenant is not specified, default tenant from general configuration is used.
List of StatQueue IDs to collect metrics from. Can include tenant in format <[tenant]:ID>. If tenant is not specified, default tenant from general configuration is used. Leave empty to automatically collect metrics from all available StatQueues (requires admins_conns).
Available Metrics
-----------------
@@ -135,6 +139,11 @@ The PrometheusAgent operates differently than other CGRateS components that use
- When multiple connections are configured in caches_conns, the agent collects cache statistics from **all** connections for the specified cache_ids
- The agent processes metrics requests only when Prometheus sends a scrape request to the configured HTTP endpoint
StatQueue metrics are collected based on the ``stat_queue_ids`` configuration. When specific StatQueue IDs are provided, only those StatQueues are monitored. When ``stat_queue_ids`` is left empty, all available StatQueues are monitored by fetching StatQueue profile IDs from the configured ``admins_conns``.
.. note::
When fetching all StatQueues (empty stat_queue_ids), each AdminS connection in ``admins_conns`` corresponds to its StatS counterpart at the same index position in ``stats_conns``.
You can view all exported metrics and see what Prometheus would scrape by making a simple curl request to the HTTP endpoint:
.. code-block:: bash

View File

@@ -24,7 +24,6 @@ import (
"bytes"
"fmt"
"io"
"math/rand"
"net/http"
"testing"
"time"
@@ -76,7 +75,8 @@ func TestPrometheusAgentIT(t *testing.T) {
"*stat_filter_indexes",
"*rpc_connections"
],
"stats_conns": ["*localhost", "external"],
// "apiers_conns": ["*internal", "external"],
"stats_conns": ["*internal", "external"],
"stat_queue_ids": ["cgrates.org:SQ_1","SQ_2"]
}
}`
@@ -169,8 +169,8 @@ func processStats(t *testing.T, client *birpc.Client) {
ID: utils.GenUUID(),
Event: map[string]any{},
APIOpts: map[string]any{
utils.MetaUsage: time.Duration(rand.Intn(3600)+60) * time.Second,
utils.MetaCost: rand.Float64()*20 + 0.1,
utils.MetaUsage: time.Duration(i) * time.Second,
utils.MetaCost: i * 10,
utils.OptsStatsProfileIDs: fmt.Sprintf("SQ_%d", i+1),
},
}, &reply); err != nil {