prometheus: add node_id label to cache metrics

This commit is contained in:
ionutboangiu
2025-07-18 22:36:55 +03:00
committed by Dan Christian Bogos
parent 71a1242522
commit e8c770c415
10 changed files with 66 additions and 49 deletions

View File

@@ -27,7 +27,6 @@ import (
"github.com/cgrates/cgrates/cores"
"github.com/cgrates/cgrates/engine"
"github.com/cgrates/cgrates/utils"
"github.com/cgrates/ltcache"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
@@ -107,7 +106,7 @@ func NewPrometheusAgent(cfg *config.CGRConfig, cm *engine.ConnManager) *Promethe
Subsystem: "cache",
Name: "groups_total",
Help: "Total number of cache groups",
}, []string{"cache"})
}, []string{"cache", "node_id"})
reg.MustRegister(cacheGroupsMetric)
cacheItemsMetric := prometheus.NewGaugeVec(
@@ -116,7 +115,7 @@ func NewPrometheusAgent(cfg *config.CGRConfig, cm *engine.ConnManager) *Promethe
Subsystem: "cache",
Name: "items_total",
Help: "Total number of cache items",
}, []string{"cache"})
}, []string{"cache", "node_id"})
reg.MustRegister(cacheItemsMetric)
statMetrics := prometheus.NewGaugeVec(
prometheus.GaugeOpts{
@@ -197,24 +196,26 @@ func (pa *PrometheusAgent) updateCacheStats() {
return
}
for _, connID := range pa.cfg.PrometheusAgentCfg().CacheSConns {
var cacheStats map[string]*ltcache.CacheStats
var reply engine.CacheStatsWithMetadata
if err := pa.cm.Call(context.Background(), []string{connID},
utils.CacheSv1GetCacheStats,
utils.CacheSv1GetStats,
&utils.AttrCacheIDsWithAPIOpts{
CacheIDs: pa.cfg.PrometheusAgentCfg().CacheIDs,
}, &cacheStats); err != nil {
}, &reply); err != nil {
utils.Logger.Err(fmt.Sprintf(
"<%s> failed to retrieve cache stats (connID=%q): %v",
utils.PrometheusAgent, connID, err))
continue
}
cacheStats := reply.CacheStatistics
nodeID := utils.IfaceAsString(reply.Metadata[utils.NodeID])
for cacheID, stats := range cacheStats {
if stats == nil {
continue
}
pa.cacheGroupsMetric.WithLabelValues(cacheID).Set(float64(stats.Groups))
pa.cacheItemsMetric.WithLabelValues(cacheID).Set(float64(stats.Items))
pa.cacheGroupsMetric.WithLabelValues(cacheID, nodeID).Set(float64(stats.Groups))
pa.cacheItemsMetric.WithLabelValues(cacheID, nodeID).Set(float64(stats.Items))
}
}
}

View File

@@ -25,7 +25,6 @@ import (
"github.com/cgrates/cgrates/engine"
"github.com/cgrates/cgrates/utils"
"github.com/cgrates/ltcache"
)
type CacheSv1 struct {
@@ -85,10 +84,10 @@ func (chSv1 *CacheSv1) Clear(ctx *context.Context, args *utils.AttrCacheIDsWithA
return chSv1.cacheS.V1Clear(ctx, args, reply)
}
// GetCacheStats returns CacheStats filtered by cacheIDs
func (chSv1 *CacheSv1) GetCacheStats(ctx *context.Context, args *utils.AttrCacheIDsWithAPIOpts, //
rply *map[string]*ltcache.CacheStats) error {
return chSv1.cacheS.V1GetCacheStats(ctx, args, rply)
// GetStats returns the statistics of the caches selected by args.CacheIDs
// together with reply metadata. It is a thin API wrapper that delegates to
// the V1GetStats method of the underlying cache service and returns its error.
func (chSv1 *CacheSv1) GetStats(ctx *context.Context, args *utils.AttrCacheIDsWithAPIOpts,
rply *engine.CacheStatsWithMetadata) error {
return chSv1.cacheS.V1GetStats(ctx, args, rply)
}
// PrecacheStatus checks status of active precache processes

View File

@@ -26,8 +26,6 @@ import (
"testing"
"time"
"github.com/cgrates/ltcache"
"github.com/cgrates/birpc/context"
"github.com/cgrates/cgrates/engine"
@@ -342,16 +340,16 @@ func testCacheSLoadCache(t *testing.T) {
t.Errorf("Unexpected reply returned")
}
var rcvStats map[string]*ltcache.CacheStats
var rcvStats engine.CacheStatsWithMetadata
expstats := engine.GetDefaultEmptyCacheStats()
expstats[utils.CacheAttributeProfiles].Items = 1
expstats[utils.CacheAttributeFilterIndexes].Groups = 1
expstats[utils.CacheAttributeFilterIndexes].Items = 1
expstats[utils.CacheLoadIDs].Items = 28
if err := chcRPC.Call(context.Background(), utils.CacheSv1GetCacheStats,
if err := chcRPC.Call(context.Background(), utils.CacheSv1GetStats,
new(utils.AttrCacheIDsWithAPIOpts), &rcvStats); err != nil {
t.Error(err)
} else if !reflect.DeepEqual(rcvStats, expstats) {
} else if !reflect.DeepEqual(rcvStats.CacheStatistics, expstats) {
t.Errorf("Expected %+v \n, received %+v", utils.ToJSON(expstats), utils.ToJSON(rcvStats))
}
}
@@ -518,7 +516,7 @@ func testCacheGetStatusMoreIDs(t *testing.T) {
t.Errorf("Unexpected reply returned")
}
var rcvStats map[string]*ltcache.CacheStats
var rcvStats engine.CacheStatsWithMetadata
expstats := engine.GetDefaultEmptyCacheStats()
expstats[utils.CacheAttributeProfiles].Items = 4
expstats[utils.CacheAttributeFilterIndexes].Groups = 1
@@ -531,10 +529,10 @@ func testCacheGetStatusMoreIDs(t *testing.T) {
expstats[utils.CacheFilters].Items = 6
expstats[utils.CacheRPCConnections].Items = 1
expstats[utils.CacheLoadIDs].Items = 28
if err := chcRPC.Call(context.Background(), utils.CacheSv1GetCacheStats,
if err := chcRPC.Call(context.Background(), utils.CacheSv1GetStats,
new(utils.AttrCacheIDsWithAPIOpts), &rcvStats); err != nil {
t.Error(err)
} else if !reflect.DeepEqual(rcvStats, expstats) {
} else if !reflect.DeepEqual(rcvStats.CacheStatistics, expstats) {
t.Errorf("Expected %+v \n, received %+v", utils.ToJSON(expstats), utils.ToJSON(rcvStats))
}
}
@@ -549,12 +547,12 @@ func testCacheSClearCache(t *testing.T) {
}
//all cache cleared, empty items in cache
var rcvStats map[string]*ltcache.CacheStats
var rcvStats engine.CacheStatsWithMetadata
expStats := engine.GetDefaultEmptyCacheStats()
if err := chcRPC.Call(context.Background(), utils.CacheSv1GetCacheStats,
if err := chcRPC.Call(context.Background(), utils.CacheSv1GetStats,
new(utils.AttrCacheIDsWithAPIOpts), &rcvStats); err != nil {
t.Error(err)
} else if !reflect.DeepEqual(rcvStats, expStats) {
} else if !reflect.DeepEqual(rcvStats.CacheStatistics, expStats) {
t.Errorf("Expected %+v \n, received %+v", utils.ToJSON(expStats), utils.ToJSON(rcvStats))
}
}

View File

@@ -27,7 +27,6 @@ import (
"github.com/cgrates/cgrates/config"
"github.com/cgrates/cgrates/engine"
"github.com/cgrates/cgrates/utils"
"github.com/cgrates/ltcache"
)
func TestCacheHasItemAndGetItem(t *testing.T) {
@@ -293,18 +292,19 @@ func TestGetCacheStats(t *testing.T) {
ch := engine.NewCacheS(cfg, dm, connMgr, nil)
cache := NewCacheSv1(ch)
ch.SetWithoutReplicate(utils.CacheAttributeProfiles, "cgrates.org:TestGetCacheStats", nil, nil, true, utils.NonTransactional)
var reply map[string]*ltcache.CacheStats
var reply engine.CacheStatsWithMetadata
args := &utils.AttrCacheIDsWithAPIOpts{
Tenant: "cgrates.org",
APIOpts: map[string]any{},
CacheIDs: []string{utils.CacheAttributeProfiles},
}
if err := cache.GetCacheStats(context.Background(), args, &reply); err != nil {
if err := cache.GetStats(context.Background(), args, &reply); err != nil {
t.Error(err)
}
if reply[utils.CacheAttributeProfiles].Items != 1 {
t.Errorf("Expected 1\n but received %v", reply[utils.CacheAttributeProfiles].Items)
cacheStats := reply.CacheStatistics
if cacheStats[utils.CacheAttributeProfiles].Items != 1 {
t.Errorf("Expected 1\n but received %v", cacheStats[utils.CacheAttributeProfiles].Items)
}
}

View File

@@ -19,14 +19,14 @@ along with this program. If not, see <http://www.gnu.org/licenses/>
package console
import (
"github.com/cgrates/cgrates/engine"
"github.com/cgrates/cgrates/utils"
"github.com/cgrates/ltcache"
)
func init() {
c := &CmdGetCacheStats{
name: "cache_stats",
rpcMethod: utils.CacheSv1GetCacheStats,
rpcMethod: utils.CacheSv1GetStats,
rpcParams: &utils.AttrCacheIDsWithAPIOpts{},
}
commands[c.Name()] = c
@@ -61,6 +61,5 @@ func (self *CmdGetCacheStats) PostprocessRpcParams() error {
}
func (self *CmdGetCacheStats) RpcResult() any {
reply := make(map[string]*ltcache.CacheStats)
return &reply
return &engine.CacheStatsWithMetadata{}
}

View File

@@ -7,7 +7,7 @@ PrometheusAgent
1. **Core metrics** - collected from configured CGRateS engines via CoreSv1.Status API
2. **StatQueue metrics** - values from CGRateS :ref:`StatS <stats>` component, collected via StatSv1.GetQueueFloatMetrics API
3. **Cache statistics** - collected from configured :ref:`CacheS <caches>` components via CacheSv1.GetCacheStats API
3. **Cache statistics** - collected from configured :ref:`CacheS <caches>` components via CacheSv1.GetStats API
For core metrics, the agent computes real-time values on each Prometheus scrape request. For StatQueue metrics, it retrieves the current state of the stored StatQueues without additional calculations. For cache statistics, it collects current cache utilization data from the configured cache partitions.
@@ -103,9 +103,10 @@ The PrometheusAgent exposes the following metrics:
go_memstats_alloc_bytes{node_id="e94160b"} 1.1360808e+07
3. **Cache Metrics** (when caches_conns is configured)
- Two separate metrics for cache statistics: ``cgrates_cache_groups_total`` and ``cgrates_cache_items_total`` with cache partition ID label
- Two separate metrics for cache statistics: ``cgrates_cache_groups_total`` and ``cgrates_cache_items_total`` with cache partition ID and node_id labels
- Obtained from CacheS services on each scrape request
- Useful for identifying memory usage patterns and potential performance issues
- The node_id label identifies the engine that reported each value, so cache metrics collected from multiple CGRateS engines stay distinguishable
Example of cache metrics output:
@@ -113,15 +114,15 @@ The PrometheusAgent exposes the following metrics:
# HELP cgrates_cache_groups_total Total number of cache groups
# TYPE cgrates_cache_groups_total gauge
cgrates_cache_groups_total{cache="*attribute_filter_indexes"} 2
cgrates_cache_groups_total{cache="*charger_profiles"} 0
cgrates_cache_groups_total{cache="*rpc_connections"} 0
cgrates_cache_groups_total{cache="*attribute_filter_indexes",node_id="dc2cb63"} 2
cgrates_cache_groups_total{cache="*charger_profiles",node_id="dc2cb63"} 0
cgrates_cache_groups_total{cache="*rpc_connections",node_id="dc2cb63"} 0
# HELP cgrates_cache_items_total Total number of cache items
# TYPE cgrates_cache_items_total gauge
cgrates_cache_items_total{cache="*attribute_filter_indexes"} 6
cgrates_cache_items_total{cache="*charger_profiles"} 2
cgrates_cache_items_total{cache="*rpc_connections"} 1
cgrates_cache_items_total{cache="*attribute_filter_indexes",node_id="dc2cb63"} 6
cgrates_cache_items_total{cache="*charger_profiles",node_id="dc2cb63"} 2
cgrates_cache_items_total{cache="*rpc_connections",node_id="dc2cb63"} 1
How It Works

View File

@@ -396,6 +396,22 @@ func (chS *CacheS) V1GetCacheStats(ctx *context.Context, args *utils.AttrCacheID
return
}
// CacheStatsWithMetadata is the reply type filled in by CacheS.V1GetStats:
// cache statistics plus metadata about the reply.
type CacheStatsWithMetadata struct {
// CacheStatistics holds the statistics obtained from the cache, keyed by cache ID.
CacheStatistics map[string]*ltcache.CacheStats
// Metadata carries additional reply information; V1GetStats stores the
// configured node ID under the utils.NodeID key.
Metadata map[string]any
}
// V1GetStats fills rply with the statistics that the internal cache reports
// for args.CacheIDs and with a metadata map holding the node ID taken from
// the general configuration (stored under utils.NodeID).
//
// Any previous content of rply is overwritten. The method always returns nil.
func (chS *CacheS) V1GetStats(ctx *context.Context, args *utils.AttrCacheIDsWithAPIOpts,
	rply *CacheStatsWithMetadata) error {
	// Gather the statistics first and the metadata second, then publish both
	// with a single assignment so rply is replaced as a whole.
	stats := chS.tCache.GetCacheStats(args.CacheIDs)
	meta := map[string]any{
		utils.NodeID: chS.cfg.GeneralCfg().NodeID,
	}
	*rply = CacheStatsWithMetadata{CacheStatistics: stats, Metadata: meta}
	return nil
}
func (chS *CacheS) V1PrecacheStatus(_ *context.Context, args *utils.AttrCacheIDsWithAPIOpts, rply *map[string]string) (err error) {
if len(args.CacheIDs) == 0 {
args.CacheIDs = utils.CachePartitions.AsSlice()

View File

@@ -58,8 +58,13 @@ func TestPrometheusAgentIT(t *testing.T) {
"prometheus_agent": {
"enabled": true,
"path": "/metrics",
"collect_go_metrics": false,
"collect_process_metrics": false,
"caches_conns": ["*localhost", "external"],
"cache_ids": [
"*statqueue_profiles",
"*statqueues",
"*stat_filter_indexes",
"*rpc_connections"
],
"stats_conns": ["*localhost", "external"],
"stat_queue_ids": ["cgrates.org:SQ_1","SQ_2"]
}

View File

@@ -32,7 +32,6 @@ import (
"github.com/cgrates/cgrates/engine"
"github.com/cgrates/cgrates/loaders"
"github.com/cgrates/cgrates/utils"
"github.com/cgrates/ltcache"
)
var (
@@ -134,7 +133,7 @@ func testTutSMGCacheStats(t *testing.T) {
// Actions: 9, ActionPlans: 4, AccountActionPlans: 5, SharedGroups: 1, ResourceProfiles: 3,
// Resources: 3, StatQueues: 1, StatQueueProfiles: 1, Thresholds: 7, ThresholdProfiles: 7, Filters: 15,
// SupplierProfiles: 3, AttributeProfiles: 2}
var rcvStats map[string]*ltcache.CacheStats
var rcvStats engine.CacheStatsWithMetadata
expectedStats := engine.GetDefaultEmptyCacheStats()
expectedStats[utils.CacheAccountsFilterIndexes].Items = 1
expectedStats[utils.CacheAccountsFilterIndexes].Groups = 1
@@ -184,9 +183,9 @@ func testTutSMGCacheStats(t *testing.T) {
expectedStats[utils.CacheAttributeFilterIndexes].Groups = 1
expectedStats[utils.CacheReverseFilterIndexes].Items = 19
expectedStats[utils.CacheReverseFilterIndexes].Groups = 16
if err := tutSMGRpc.Call(context.Background(), utils.CacheSv1GetCacheStats, new(utils.AttrCacheIDsWithAPIOpts), &rcvStats); err != nil {
if err := tutSMGRpc.Call(context.Background(), utils.CacheSv1GetStats, new(utils.AttrCacheIDsWithAPIOpts), &rcvStats); err != nil {
t.Error("Got error on CacheSv1.GetCacheStats: ", err.Error())
} else if !reflect.DeepEqual(expectedStats, rcvStats) {
} else if !reflect.DeepEqual(expectedStats, rcvStats.CacheStatistics) {
t.Errorf("expected: %+v,\n received: %+v", utils.ToJSON(expectedStats), utils.ToJSON(rcvStats))
}
}

View File

@@ -1438,7 +1438,6 @@ const (
// APIerSv1GetDataCost = "APIerSv1.GetDataCost"
// APIerSv1ReplayFailedPosts = "APIerSv1.ReplayFailedPosts"
// APIerSv1GetCacheStats = "APIerSv1.GetCacheStats"
// APIerSv1ReloadCache = "APIerSv1.ReloadCache"
// APIerSv1RemoveActions = "APIerSv1.RemoveActions"
// APIerSv1GetLoadHistory = "APIerSv1.GetLoadHistory"
@@ -1764,7 +1763,7 @@ const (
// CacheS APIs
const (
CacheSv1 = "CacheSv1"
CacheSv1GetCacheStats = "CacheSv1.GetCacheStats"
CacheSv1GetStats = "CacheSv1.GetStats"
CacheSv1GetItemIDs = "CacheSv1.GetItemIDs"
CacheSv1HasItem = "CacheSv1.HasItem"
CacheSv1GetItem = "CacheSv1.GetItem"