diff --git a/agents/prometheus.go b/agents/prometheus.go index 0f180590f..2dd919965 100644 --- a/agents/prometheus.go +++ b/agents/prometheus.go @@ -216,11 +216,23 @@ func (pa *PrometheusAgent) ServeHTTP(w http.ResponseWriter, r *http.Request) { // updateStatsMetrics fetches and updates all StatQueue metrics by calling each // configured StatS connection. func (pa *PrometheusAgent) updateStatsMetrics() { - if len(pa.cfg.PrometheusAgentCfg().StatQueueIDs) == 0 { - return - } - for _, connID := range pa.cfg.PrometheusAgentCfg().StatSConns { - for _, sqID := range pa.cfg.PrometheusAgentCfg().StatQueueIDs { + for connIdx, connID := range pa.cfg.PrometheusAgentCfg().StatSConns { + sqIDs := pa.cfg.PrometheusAgentCfg().StatQueueIDs + + // When no StatQueueIDs set, fetch all available ones. + if len(sqIDs) == 0 { + adminsConnID := pa.cfg.PrometheusAgentCfg().AdminSConns[connIdx] + if err := pa.cm.Call(context.Background(), []string{adminsConnID}, + utils.AdminSv1GetStatQueueProfileIDs, + &utils.ArgsItemIDs{}, &sqIDs); err != nil { + utils.Logger.Err(fmt.Sprintf( + "<%s> failed to retrieve all StatQueue IDs (connID=%q): %v", + utils.PrometheusAgent, adminsConnID, err)) + continue + } + } + + for _, sqID := range sqIDs { tenantID := utils.NewTenantID(sqID) if tenantID.Tenant == "" { diff --git a/config/config_defaults.go b/config/config_defaults.go index 9f784abc7..709c730a4 100644 --- a/config/config_defaults.go +++ b/config/config_defaults.go @@ -1089,6 +1089,7 @@ const CGRATES_CFG_JSON = ` "prometheus_agent": { "enabled": false, // enables the prometheus agent: "path": "/prometheus", // endpoint for prometheus metrics + "admins_conns": [], // connections to AdminS, empty to disable: <""|*internal|$rpc_conns_id> "caches_conns": [], // connections to CacheS, empty to disable: <""|*internal|$rpc_conns_id> "cache_ids": [], // cache partition IDs to collect statistics for, empty for all partitions "cores_conns": [], // connections to CoreS, empty to disable: 
<""|*internal|$rpc_conns_id> diff --git a/config/configsanity.go b/config/configsanity.go index e71430d51..c9e1cec5d 100644 --- a/config/configsanity.go +++ b/config/configsanity.go @@ -1253,8 +1253,46 @@ func (cfg *CGRConfig) checkConfigSanity() error { return fmt.Errorf("<%s> the CleanupInterval needs to be bigger than 0", utils.AnalyzerS) } } - if err := cfg.prometheusAgentCfg.validate(cfg); err != nil { - return err + if cfg.prometheusAgentCfg.Enabled { + if len(cfg.prometheusAgentCfg.StatSConns) > 0 && + len(cfg.prometheusAgentCfg.StatQueueIDs) == 0 && + len(cfg.prometheusAgentCfg.StatSConns) != len(cfg.prometheusAgentCfg.AdminSConns) { + return fmt.Errorf( + "<%s> when StatQueueIDs is empty, admins_conns must match stats_conns length to fetch StatQueue IDs", + utils.PrometheusAgent) + } + for _, connID := range cfg.prometheusAgentCfg.AdminSConns { + if strings.HasPrefix(connID, utils.MetaInternal) && !cfg.admS.Enabled { + return fmt.Errorf("<%s> not enabled but requested by <%s> component", utils.AdminS, utils.PrometheusAgent) + } + if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { + return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) + } + } + for _, connID := range cfg.prometheusAgentCfg.CacheSConns { + if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { + return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) + } + } + for _, connID := range cfg.prometheusAgentCfg.CoreSConns { + if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { + return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) + } + } + for _, connID := range cfg.prometheusAgentCfg.StatSConns { + if strings.HasPrefix(connID, utils.MetaInternal) && !cfg.statsCfg.Enabled { + return fmt.Errorf("<%s> not enabled but requested by <%s> component", utils.StatService, 
utils.PrometheusAgent) + } + if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { + return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) + } + } + if len(cfg.prometheusAgentCfg.CoreSConns) > 0 { + if cfg.prometheusAgentCfg.CollectGoMetrics || cfg.prometheusAgentCfg.CollectProcessMetrics { + return fmt.Errorf("<%s> collect_go_metrics and collect_process_metrics cannot be enabled when using CoreSConns", + utils.PrometheusAgent) + } + } } return nil diff --git a/config/prometheus.go b/config/prometheus.go index 35fbfe0c1..e0bc87207 100644 --- a/config/prometheus.go +++ b/config/prometheus.go @@ -19,9 +19,7 @@ along with this program. If not, see package config import ( - "fmt" "slices" - "strings" "github.com/cgrates/birpc/context" "github.com/cgrates/cgrates/utils" @@ -33,6 +31,7 @@ type PrometheusAgentJsonCfg struct { Path *string `json:"path"` CollectGoMetrics *bool `json:"collect_go_metrics"` CollectProcessMetrics *bool `json:"collect_process_metrics"` + AdminSConns *[]string `json:"admins_conns"` CacheSConns *[]string `json:"caches_conns"` CacheIDs *[]string `json:"cache_ids"` CoreSConns *[]string `json:"cores_conns"` @@ -46,6 +45,7 @@ type PrometheusAgentCfg struct { Path string CollectGoMetrics bool CollectProcessMetrics bool + AdminSConns []string CacheSConns []string CacheIDs []string CoreSConns []string @@ -78,6 +78,9 @@ func (c *PrometheusAgentCfg) loadFromJSONCfg(jc *PrometheusAgentJsonCfg) error { if jc.CollectProcessMetrics != nil { c.CollectProcessMetrics = *jc.CollectProcessMetrics } + if jc.AdminSConns != nil { + c.AdminSConns = tagInternalConns(*jc.AdminSConns, utils.MetaAdminS) + } if jc.CacheSConns != nil { c.CacheSConns = tagInternalConns(*jc.CacheSConns, utils.MetaCaches) } @@ -103,6 +106,7 @@ func (c PrometheusAgentCfg) AsMapInterface() any { utils.PathCfg: c.Path, utils.CollectGoMetricsCfg: c.CollectGoMetrics, utils.CollectProcessMetricsCfg: c.CollectProcessMetrics, 
+ utils.AdminSConnsCfg: stripInternalConns(c.AdminSConns), utils.CacheSConnsCfg: stripInternalConns(c.CacheSConns), utils.CacheIDsCfg: stripInternalConns(c.CacheIDs), utils.CoreSConnsCfg: stripInternalConns(c.CoreSConns), @@ -121,6 +125,7 @@ func (c PrometheusAgentCfg) Clone() *PrometheusAgentCfg { Path: c.Path, CollectGoMetrics: c.CollectGoMetrics, CollectProcessMetrics: c.CollectProcessMetrics, + AdminSConns: slices.Clone(c.AdminSConns), CacheSConns: slices.Clone(c.CacheSConns), CacheIDs: slices.Clone(c.CacheIDs), CoreSConns: slices.Clone(c.CoreSConns), @@ -129,37 +134,6 @@ func (c PrometheusAgentCfg) Clone() *PrometheusAgentCfg { } } -func (c PrometheusAgentCfg) validate(cfg *CGRConfig) error { - if !c.Enabled { - return nil - } - for _, connID := range cfg.prometheusAgentCfg.CacheSConns { - if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { - return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) - } - } - for _, connID := range cfg.prometheusAgentCfg.CoreSConns { - if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { - return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) - } - } - for _, connID := range c.StatSConns { - if strings.HasPrefix(connID, utils.MetaInternal) && !cfg.statsCfg.Enabled { - return fmt.Errorf("<%s> not enabled but requested by <%s> component", utils.StatService, utils.PrometheusAgent) - } - if _, has := cfg.rpcConns[connID]; !has && !strings.HasPrefix(connID, utils.MetaInternal) { - return fmt.Errorf("<%s> connection with id: <%s> not defined", utils.PrometheusAgent, connID) - } - } - if len(c.CoreSConns) > 0 { - if c.CollectGoMetrics || c.CollectProcessMetrics { - return fmt.Errorf("<%s> collect_go_metrics and collect_process_metrics cannot be enabled when using CoreSConns", - utils.PrometheusAgent) - } - } - return nil -} - func diffPrometheusAgentJsonCfg(d 
*PrometheusAgentJsonCfg, v1, v2 *PrometheusAgentCfg) *PrometheusAgentJsonCfg { if d == nil { d = new(PrometheusAgentJsonCfg) @@ -177,6 +151,9 @@ func diffPrometheusAgentJsonCfg(d *PrometheusAgentJsonCfg, v1, v2 *PrometheusAge if v1.CollectProcessMetrics != v2.CollectProcessMetrics && true { d.CollectProcessMetrics = utils.BoolPointer(v2.CollectProcessMetrics) } + if !slices.Equal(v1.AdminSConns, v2.AdminSConns) { + d.AdminSConns = utils.SliceStringPointer(v2.AdminSConns) + } if !slices.Equal(v1.CoreSConns, v2.CoreSConns) { d.CoreSConns = utils.SliceStringPointer(v2.CoreSConns) } diff --git a/docs/prometheus.rst b/docs/prometheus.rst index 6fc739360..592e90758 100644 --- a/docs/prometheus.rst +++ b/docs/prometheus.rst @@ -21,6 +21,7 @@ Example configuration in the JSON file: "prometheus_agent": { "enabled": true, "path": "/prometheus", + "admins_conns": ["*internal", "external"], "caches_conns": ["*internal"], "cache_ids": [ "*attribute_filter_indexes", @@ -43,6 +44,9 @@ enabled path HTTP endpoint path where Prometheus metrics will be exposed, e.g., "/prometheus" or "/metrics" +admins_conns + List of connection IDs to AdminS components. Required when stat_queue_ids is empty to fetch all available StatQueue profile IDs. Must match the length of stats_conns when auto-fetching is used. Possible values: <""|*internal|$rpc_conns_id> + caches_conns List of connection IDs to CacheS components for collecting cache statistics. Empty list disables cache metrics collection. Possible values: <""|*internal|$rpc_conns_id> @@ -56,7 +60,7 @@ stats_conns List of connection IDs to StatS components for collecting StatQueue metrics. Empty list disables StatQueue metrics collection. Possible values: <""|*internal|$rpc_conns_id> stat_queue_ids - List of StatQueue IDs to collect metrics from. Can include tenant in format <[tenant]:ID>. If tenant is not specified, default tenant from general configuration is used. + List of StatQueue IDs to collect metrics from. 
Can include tenant in format <[tenant]:ID>. If tenant is not specified, default tenant from general configuration is used. Leave empty to automatically collect metrics from all available StatQueues (requires admins_conns). Available Metrics ----------------- @@ -135,6 +139,11 @@ The PrometheusAgent operates differently than other CGRateS components that use - When multiple connections are configured in caches_conns, the agent collects cache statistics from **all** connections for the specified cache_ids - The agent processes metrics requests only when Prometheus sends a scrape request to the configured HTTP endpoint +StatQueue metrics are collected based on the ``stat_queue_ids`` configuration. When specific StatQueue IDs are provided, only those StatQueues are monitored. When ``stat_queue_ids`` is left empty, all available StatQueues are monitored by fetching StatQueue profile IDs from the configured ``admins_conns``. + +.. note:: + When fetching all StatQueues (empty stat_queue_ids), each AdminS connection in ``admins_conns`` corresponds to its StatS counterpart at the same index position in ``stats_conns``. + You can view all exported metrics and see what Prometheus would scrape by making a simple curl request to the HTTP endpoint: .. 
code-block:: bash diff --git a/general_tests/prometheus_it_test.go b/general_tests/prometheus_it_test.go index 12e359abd..4d180ed08 100644 --- a/general_tests/prometheus_it_test.go +++ b/general_tests/prometheus_it_test.go @@ -24,7 +24,6 @@ import ( "bytes" "fmt" "io" - "math/rand" "net/http" "testing" "time" @@ -76,7 +75,8 @@ func TestPrometheusAgentIT(t *testing.T) { "*stat_filter_indexes", "*rpc_connections" ], - "stats_conns": ["*localhost", "external"], + // "apiers_conns": ["*internal", "external"], + "stats_conns": ["*internal", "external"], "stat_queue_ids": ["cgrates.org:SQ_1","SQ_2"] } }` @@ -169,8 +169,8 @@ func processStats(t *testing.T, client *birpc.Client) { ID: utils.GenUUID(), Event: map[string]any{}, APIOpts: map[string]any{ - utils.MetaUsage: time.Duration(rand.Intn(3600)+60) * time.Second, - utils.MetaCost: rand.Float64()*20 + 0.1, + utils.MetaUsage: time.Duration(i) * time.Second, + utils.MetaCost: i * 10, utils.OptsStatsProfileIDs: fmt.Sprintf("SQ_%d", i+1), }, }, &reply); err != nil {