Documentation

adrs/030-distributed-caching-strategy.md

ADR-030: Distributed Caching Strategy

Status

Proposed

Context

As Dynaplex scales to handle more users and data, we need a caching strategy to improve performance and reduce database load. Currently, we have minimal caching, leading to repeated database queries for the same data. With our microservices architecture, we need a distributed caching solution that can be shared across services.

Current challenges:

  • Repeated database queries for reference data
  • No cache invalidation strategy
  • Each service instance has its own memory cache (not shared)
  • Poor performance for frequently accessed data
  • No caching for API responses

Requirements:

  • Distributed cache accessible by all service instances
  • Cache invalidation across services
  • Support for different cache expiration policies
  • High availability and performance
  • Cost-effective solution

Decision

We propose implementing a distributed caching strategy with a provider-agnostic abstraction layer, allowing us to evaluate and choose the best caching solution for our needs.

Initial evaluation candidates:

  • Redis (leading candidate)
  • Azure Cache for Redis (managed Redis)
  • NCache
  • Hazelcast
  • SQL Server In-Memory tables (for specific scenarios)

Consequences

Positive

  • Performance: Significantly faster data access
  • Scalability: Reduced database load
  • Cost Savings: Fewer database queries
  • Consistency: All service instances read the same cached values instead of diverging per-instance copies
  • Flexibility: Provider-agnostic design
  • Resilience: Services can keep serving already-cached reads during a short database outage (cache misses and writes still require the database)

Negative

  • Complexity: Cache invalidation is hard
  • Cost: Additional infrastructure required
  • Consistency: Potential for stale data
  • Debugging: Harder to debug cache-related issues
  • Memory: Additional memory requirements

Neutral

  • Provider Selection: Must evaluate and choose provider
  • Network Latency: Cache calls have network overhead
  • Monitoring: Need cache-specific monitoring

Proposed Implementation

Cache Abstraction Layer

/// <summary>
/// Provider-agnostic contract for a cache shared by all service instances.
/// Callers depend on this interface rather than on a specific cache product.
/// </summary>
public interface IDistributedCacheService
{
    /// <summary>Reads the value stored under <paramref name="key"/>.</summary>
    /// <typeparam name="T">Type the stored value is read as.</typeparam>
    /// <param name="key">Cache key to look up.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    /// <returns>The cached value, or <c>default</c> when nothing usable is stored.</returns>
    Task<T?> GetAsync<T>(string key, CancellationToken cancellationToken = default);
    /// <summary>Stores <paramref name="value"/> under <paramref name="key"/>.</summary>
    /// <param name="key">Cache key to write.</param>
    /// <param name="value">Value to store.</param>
    /// <param name="options">Expiration, priority and tags for the entry; <c>null</c> leaves the choice to the implementation.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task SetAsync<T>(string key, T value, CacheOptions? options = null, CancellationToken cancellationToken = default);
    /// <summary>Removes the entry stored under <paramref name="key"/>.</summary>
    /// <param name="key">Cache key to remove.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task RemoveAsync(string key, CancellationToken cancellationToken = default);
    /// <summary>Removes every entry whose key matches <paramref name="pattern"/>.</summary>
    /// <param name="pattern">Key pattern. NOTE(review): the pattern syntax is provider-specific - confirm per implementation.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task RemoveByPatternAsync(string pattern, CancellationToken cancellationToken = default);
    /// <summary>Reports whether an entry is stored under <paramref name="key"/>.</summary>
    /// <param name="key">Cache key to test.</param>
    /// <param name="cancellationToken">Token for cancelling the operation.</param>
    Task<bool> ExistsAsync(string key, CancellationToken cancellationToken = default);
    /// <summary>
    /// Returns the cached value for <paramref name="key"/>, or invokes <paramref name="factory"/>
    /// to produce it when the cache has no value.
    /// </summary>
    /// <param name="key">Cache key to look up and, on a miss, populate.</param>
    /// <param name="factory">Produces the value on a miss. It receives no cancellation token.</param>
    /// <param name="options">Options applied when the produced value is stored.</param>
    /// <param name="cancellationToken">Token for cancelling the cache operations.</param>
    Task<T> GetOrCreateAsync<T>(string key, Func<Task<T>> factory, CacheOptions? options = null, CancellationToken cancellationToken = default);
}

/// <summary>Per-entry settings passed to <c>SetAsync</c> and <c>GetOrCreateAsync</c>.</summary>
public class CacheOptions
{
    /// <summary>Lifetime of the entry measured from the moment it is written; <c>null</c> when not specified.</summary>
    public TimeSpan? AbsoluteExpiration { get; set; }
    /// <summary>
    /// Idle window after which the entry expires; <c>null</c> when not specified.
    /// NOTE(review): how this combines with <see cref="AbsoluteExpiration"/> is left to the implementation - confirm.
    /// </summary>
    public TimeSpan? SlidingExpiration { get; set; }
    /// <summary>Relative importance of the entry; defaults to <see cref="CachePriority.Normal"/>.</summary>
    public CachePriority Priority { get; set; } = CachePriority.Normal;
    /// <summary>Optional labels used to invalidate groups of entries together; <c>null</c> for none.</summary>
    public string[]? Tags { get; set; }
}

/// <summary>
/// Relative importance of a cache entry.
/// NOTE(review): no code shown in this document reads the priority, so the eviction
/// behaviour attached to each level is still to be defined per provider.
/// </summary>
public enum CachePriority
{
    // Underlying value 0, so this is also default(CachePriority).
    Low,
    // The default assigned by CacheOptions.Priority.
    Normal,
    High,
    // Presumably exempts the entry from eviction - TODO confirm once a provider honours priorities.
    NeverRemove
}

Redis Implementation

/// <summary>
/// Redis-backed <see cref="IDistributedCacheService"/> that stores values as camelCase JSON strings.
/// </summary>
/// <remarks>
/// Cache failures are logged and swallowed so that a cache outage degrades performance
/// instead of breaking callers. Cancellation is observed before each Redis round trip;
/// the Redis calls used here take no cancellation token themselves.
/// </remarks>
public class RedisCacheService : IDistributedCacheService
{
    // Lifetime used when the caller supplies no expiration at all.
    private static readonly TimeSpan DefaultExpiration = TimeSpan.FromMinutes(5);

    // Upper bound on keys sent in one delete while scanning by pattern.
    private const int DeleteBatchSize = 500;

    private readonly IConnectionMultiplexer _redis;
    private readonly IDatabase _database;
    private readonly ILogger<RedisCacheService> _logger;
    private readonly JsonSerializerOptions _jsonOptions;

    /// <summary>Creates the service on top of an existing multiplexer.</summary>
    /// <param name="redis">Shared Redis connection.</param>
    /// <param name="logger">Receives cache failure diagnostics.</param>
    /// <exception cref="ArgumentNullException">Either argument is <c>null</c>.</exception>
    public RedisCacheService(IConnectionMultiplexer redis, ILogger<RedisCacheService> logger)
    {
        ArgumentNullException.ThrowIfNull(redis);
        ArgumentNullException.ThrowIfNull(logger);

        _redis = redis;
        _database = redis.GetDatabase();
        _logger = logger;
        _jsonOptions = new JsonSerializerOptions
        {
            PropertyNamingPolicy = JsonNamingPolicy.CamelCase
        };
    }

    /// <summary>Reads and deserializes the value stored under <paramref name="key"/>.</summary>
    /// <returns>The value, or <c>default</c> on a miss, an empty payload, or a cache failure.</returns>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was already cancelled.</exception>
    public async Task<T?> GetAsync<T>(string key, CancellationToken cancellationToken = default)
    {
        var (_, value) = await TryGetAsync<T>(key, cancellationToken);
        return value;
    }

    /// <summary>Serializes <paramref name="value"/> and stores it with an expiry, indexing it under any tags.</summary>
    /// <remarks>
    /// Expiry is <see cref="CacheOptions.AbsoluteExpiration"/>, else
    /// <see cref="CacheOptions.SlidingExpiration"/>, else five minutes. The sliding window is
    /// applied as a fixed lifetime: reads do not renew it.
    /// </remarks>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was already cancelled.</exception>
    public async Task SetAsync<T>(string key, T value, CacheOptions? options = null, CancellationToken cancellationToken = default)
    {
        cancellationToken.ThrowIfCancellationRequested();

        try
        {
            var json = JsonSerializer.Serialize(value, _jsonOptions);
            var expiry = options?.AbsoluteExpiration ?? options?.SlidingExpiration ?? DefaultExpiration;

            await _database.StringSetAsync(key, json, expiry);

            // Record the key in one set per tag so the whole group can be invalidated later.
            if (options?.Tags is { Length: > 0 } tags)
            {
                await Task.WhenAll(tags.Select(tag => _database.SetAddAsync(TagKey(tag), key)));
            }
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error setting cache key {Key}", key);
            // Don't throw - caching should not break the application
        }
    }

    /// <summary>Deletes the entry stored under <paramref name="key"/>.</summary>
    /// <remarks>A failed delete is logged only; the entry then lives until its expiry.</remarks>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was already cancelled.</exception>
    public async Task RemoveAsync(string key, CancellationToken cancellationToken = default)
    {
        cancellationToken.ThrowIfCancellationRequested();

        try
        {
            await _database.KeyDeleteAsync(key);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error removing cache key {Key}", key);
        }
    }

    /// <summary>Deletes every key matching <paramref name="pattern"/> on each connected primary endpoint.</summary>
    /// <remarks>
    /// Keys are enumerated incrementally and deleted in batches rather than materialized at once.
    /// </remarks>
    /// <exception cref="OperationCanceledException">Cancellation was requested before or during the scan.</exception>
    public async Task RemoveByPatternAsync(string pattern, CancellationToken cancellationToken = default)
    {
        cancellationToken.ThrowIfCancellationRequested();

        try
        {
            foreach (var endpoint in _redis.GetEndPoints())
            {
                var server = _redis.GetServer(endpoint);

                // Replicas cannot delete, and a disconnected endpoint cannot be scanned.
                if (!server.IsConnected || server.IsReplica)
                {
                    continue;
                }

                var batch = new List<RedisKey>(DeleteBatchSize);

                await foreach (var key in server.KeysAsync(_database.Database, pattern).WithCancellation(cancellationToken))
                {
                    batch.Add(key);

                    if (batch.Count == DeleteBatchSize)
                    {
                        await _database.KeyDeleteAsync(batch.ToArray());
                        batch.Clear();
                    }
                }

                if (batch.Count > 0)
                {
                    await _database.KeyDeleteAsync(batch.ToArray());
                }
            }
        }
        catch (OperationCanceledException)
        {
            throw;
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error removing cache keys by pattern {Pattern}", pattern);
        }
    }

    /// <summary>Reports whether <paramref name="key"/> currently exists in Redis.</summary>
    /// <returns><c>true</c> when the key exists; <c>false</c> when it does not or the check failed.</returns>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was already cancelled.</exception>
    public async Task<bool> ExistsAsync(string key, CancellationToken cancellationToken = default)
    {
        cancellationToken.ThrowIfCancellationRequested();

        try
        {
            return await _database.KeyExistsAsync(key);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error checking cache key {Key}", key);
            return false;
        }
    }

    /// <summary>
    /// Returns the cached value for <paramref name="key"/>, or runs <paramref name="factory"/>,
    /// caches a non-null result, and returns it.
    /// </summary>
    /// <remarks>
    /// Hit detection is based on whether Redis returned a usable payload, not on comparing the
    /// result with <c>null</c>, so value-type results behave the same as reference types.
    /// Concurrent misses for one key may each run the factory; there is no locking here.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="factory"/> is <c>null</c>.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled before a cache call.</exception>
    public async Task<T> GetOrCreateAsync<T>(string key, Func<Task<T>> factory, CacheOptions? options = null, CancellationToken cancellationToken = default)
    {
        ArgumentNullException.ThrowIfNull(factory);

        var (found, cached) = await TryGetAsync<T>(key, cancellationToken);

        if (found)
        {
            return cached!;
        }

        var value = await factory();

        // A null result is never served from the cache (it reads back as a miss), so skip the write.
        if (value is not null)
        {
            await SetAsync(key, value, options, cancellationToken);
        }

        return value;
    }

    // Builds the name of the Redis set that indexes the keys carrying a tag.
    private static string TagKey(string tag) => $"tag:{tag}";

    // Reads a key and reports separately whether a usable value was found, so callers can
    // tell a miss apart from a stored value equal to default(T).
    private async Task<(bool Found, T? Value)> TryGetAsync<T>(string key, CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        try
        {
            var raw = await _database.StringGetAsync(key);

            if (raw.IsNullOrEmpty)
            {
                return (false, default);
            }

            var value = JsonSerializer.Deserialize<T>(raw.ToString(), _jsonOptions);
            return (value is not null, value);
        }
        catch (Exception ex)
        {
            _logger.LogError(ex, "Error getting cache key {Key}", key);
            return (false, default); // Fail gracefully
        }
    }
}

Cache Key Strategy

/// <summary>
/// Central factory for cache key strings, so every service spells a given key the same way.
/// Keys are colon-separated, going from the entity family to the specific identifier.
/// </summary>
public static class CacheKeys
{
    /// <summary>Key for a single asset.</summary>
    public static string Asset(int id)
    {
        return "asset:" + id;
    }

    /// <summary>Key for the list of assets of one type.</summary>
    public static string AssetsByType(int typeId)
    {
        return "assets:type:" + typeId;
    }

    /// <summary>Key for the list of assets at one location.</summary>
    public static string AssetsByLocation(int locationId)
    {
        return "assets:location:" + locationId;
    }

    /// <summary>Key for the complete list of asset types.</summary>
    public static string AssetTypes()
    {
        return "asset-types:all";
    }

    /// <summary>Key for a single asset type.</summary>
    public static string AssetType(int id)
    {
        return "asset-type:" + id;
    }

    /// <summary>Key for a single user.</summary>
    public static string User(Guid userId)
    {
        return "user:" + userId;
    }

    /// <summary>Key for the permissions of a single user.</summary>
    public static string UserPermissions(Guid userId)
    {
        return "user:" + userId + ":permissions";
    }

    /// <summary>Key for the location hierarchy below a root location.</summary>
    public static string LocationHierarchy(int rootId)
    {
        return "location:hierarchy:" + rootId;
    }

    /// <summary>
    /// Prefixes <paramref name="key"/> with a schema version so a breaking change to the
    /// cached shape can move to fresh keys.
    /// </summary>
    public static string Versioned(string key, int version = 1)
    {
        return "v" + version + ":" + key;
    }
}

Service Integration

/// <summary>
/// Example service that reads assets through the cache and invalidates the affected
/// entries when an asset changes.
/// </summary>
public class AssetService
{
    private readonly IDistributedCacheService _cache;
    private readonly CatalogDb _db;
    private readonly IEventBus _eventBus;

    /// <summary>Creates the service with its cache, database and event bus dependencies.</summary>
    /// <exception cref="ArgumentNullException">Any dependency is <c>null</c>.</exception>
    public AssetService(IDistributedCacheService cache, CatalogDb db, IEventBus eventBus)
    {
        _cache = cache ?? throw new ArgumentNullException(nameof(cache));
        _db = db ?? throw new ArgumentNullException(nameof(db));
        _eventBus = eventBus ?? throw new ArgumentNullException(nameof(eventBus));
    }

    /// <summary>
    /// Returns the asset with the given id, served from the cache when present and otherwise
    /// loaded (with its type and location) from the database and cached for 15 minutes.
    /// </summary>
    /// <param name="id">Asset identifier.</param>
    /// <returns>The asset, or <c>null</c> when no asset has that id.</returns>
    public async Task<Asset?> GetAssetAsync(int id)
    {
        return await _cache.GetOrCreateAsync(
            CacheKeys.Asset(id),
            async () => await _db.Assets
                .Include(a => a.Type)
                .Include(a => a.Location)
                .FirstOrDefaultAsync(a => a.Id == id),
            new CacheOptions 
            { 
                AbsoluteExpiration = TimeSpan.FromMinutes(15),
                Tags = new[] { "assets" }
            });
    }

    /// <summary>
    /// Applies <paramref name="request"/> to the asset, saves it, removes every cache entry
    /// the change can affect, and publishes an invalidation event for other services.
    /// </summary>
    /// <param name="id">Identifier of the asset to update.</param>
    /// <param name="request">Requested changes.</param>
    /// <returns>The updated asset.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="request"/> is <c>null</c>.</exception>
    /// <exception cref="KeyNotFoundException">No asset has the given id.</exception>
    public async Task<Asset> UpdateAssetAsync(int id, UpdateAssetRequest request)
    {
        ArgumentNullException.ThrowIfNull(request);

        var asset = await _db.Assets.FindAsync(id);

        if (asset is null)
        {
            throw new KeyNotFoundException($"Asset {id} was not found.");
        }

        // Remember where the asset was listed before the update: if its type or location
        // changes, the lists it is leaving are stale too, not only the ones it joins.
        var previousTypeId = asset.TypeId;
        var previousLocationId = asset.LocationId;

        // Update asset...
        await _db.SaveChangesAsync();

        // Invalidate caches
        var staleKeys = new[]
        {
            CacheKeys.Asset(id),
            CacheKeys.AssetsByType(previousTypeId),
            CacheKeys.AssetsByType(asset.TypeId),
            CacheKeys.AssetsByLocation(previousLocationId),
            CacheKeys.AssetsByLocation(asset.LocationId)
        }.Distinct().ToArray();

        foreach (var key in staleKeys)
        {
            await _cache.RemoveAsync(key);
        }

        // Publish cache invalidation event for other services, naming every key removed here.
        await _eventBus.PublishAsync(new CacheInvalidationEvent
        {
            Keys = staleKeys,
            Tags = new[] { "assets" }
        });

        return asset;
    }
}

Cache Invalidation Strategies

1. Time-based Expiration

// Short-lived entry for frequently changing data: expires 5 minutes after it is written.
new CacheOptions { AbsoluteExpiration = TimeSpan.FromMinutes(5) }

// Long-lived entry for reference data: expires 24 hours after it is written.
new CacheOptions { AbsoluteExpiration = TimeSpan.FromHours(24) }

// Sliding expiration for active data: requests a 10-minute idle window.
// NOTE(review): confirm that the chosen provider implementation renews the window on reads.
new CacheOptions { SlidingExpiration = TimeSpan.FromMinutes(10) }

2. Event-based Invalidation

/// <summary>
/// Removes cache entries affected by an <c>AssetUpdatedEvent</c> raised by another service.
/// </summary>
public class CacheInvalidationHandler : IEventHandler<AssetUpdatedEvent>
{
    private readonly IDistributedCacheService _cache;

    /// <summary>Creates the handler with the cache it invalidates.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="cache"/> is <c>null</c>.</exception>
    public CacheInvalidationHandler(IDistributedCacheService cache)
    {
        _cache = cache ?? throw new ArgumentNullException(nameof(cache));
    }

    /// <summary>Removes the updated asset's own entry and all cached asset lists.</summary>
    /// <param name="event">The update notification; only its <c>AssetId</c> is read.</param>
    /// <exception cref="ArgumentNullException"><paramref name="event"/> is <c>null</c>.</exception>
    public async Task HandleAsync(AssetUpdatedEvent @event)
    {
        ArgumentNullException.ThrowIfNull(@event);

        await _cache.RemoveAsync(CacheKeys.Asset(@event.AssetId));

        // List keys end in a type id or a location id ("assets:type:{typeId}",
        // "assets:location:{locationId}"), never in an asset id, so matching on the asset id
        // would delete unrelated lists and keep the stale ones. Only the asset id is read from
        // the event here, so every asset list is dropped.
        // NOTE(review): if the event carries the type and location ids, narrow this to those keys.
        await _cache.RemoveByPatternAsync("assets:*");
    }
}

3. Tag-based Invalidation

/// <summary>
/// Deletes every cache entry recorded under <paramref name="tag"/>, then the tag's own
/// index set. Does nothing when the tag has no recorded keys.
/// </summary>
/// <param name="tag">Tag whose entries are invalidated.</param>
public async Task InvalidateTagAsync(string tag)
{
    var members = await _database.SetMembersAsync($"tag:{tag}");

    // Nothing indexed under this tag: leave Redis untouched.
    if (!members.Any())
    {
        return;
    }

    // The set stores key names as values; turn them back into keys for the delete.
    var taggedKeys = members
        .Select(member => (RedisKey)member.ToString())
        .ToArray();

    await _database.KeyDeleteAsync(taggedKeys);
    await _database.KeyDeleteAsync($"tag:{tag}");
}

Cache Warming

/// <summary>
/// Hosted service that pre-loads reference data into the cache at startup and then
/// refreshes caches once per hour until the host stops.
/// </summary>
/// <remarks>
/// NOTE(review): <c>_db</c>, <c>_cache</c> and <c>RefreshExpiringCachesAsync</c> are used here
/// but are not declared in this excerpt; they must be supplied by the full implementation.
/// </remarks>
public class CacheWarmingService : BackgroundService
{
    /// <summary>Runs the initial warm-up, then the hourly refresh loop.</summary>
    /// <param name="stoppingToken">Signalled when the host is shutting down.</param>
    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        // Warm up critical caches on startup; shutdown cancels an in-flight warm-up.
        await WarmReferenceDataAsync(stoppingToken);
        
        // Periodic refresh
        using var timer = new PeriodicTimer(TimeSpan.FromHours(1));
        
        while (await timer.WaitForNextTickAsync(stoppingToken))
        {
            await RefreshExpiringCachesAsync();
        }
    }
    
    /// <summary>Loads asset types and locations from the database and caches each for 24 hours.</summary>
    /// <param name="cancellationToken">Flowed into the database queries and cache writes.</param>
    private async Task WarmReferenceDataAsync(CancellationToken cancellationToken)
    {
        var referenceDataOptions = new CacheOptions { AbsoluteExpiration = TimeSpan.FromHours(24) };

        // Pre-load frequently accessed reference data
        var assetTypes = await _db.AssetTypes.ToListAsync(cancellationToken);
        await _cache.SetAsync(CacheKeys.AssetTypes(), assetTypes, referenceDataOptions, cancellationToken);
        
        // The full location list is stored under the hierarchy key for root id 0.
        var locations = await _db.Locations.ToListAsync(cancellationToken);
        await _cache.SetAsync(CacheKeys.LocationHierarchy(0), locations, referenceDataOptions, cancellationToken);
    }
}

Response Caching

// API response caching: cache GET /api/assets responses for 5 minutes, tagged "assets"
// so they can be evicted as a group, with a separate entry per paging/filter combination.
app.MapGet("/api/assets", GetAssets)
    .CacheOutput(policy => policy
        .Expire(TimeSpan.FromMinutes(5))
        .Tag("assets")
        // The policy builder method for varying by query string is SetVaryByQuery.
        .SetVaryByQuery("page", "pageSize", "filter"));

// Custom cache policy
/// <summary>
/// Output cache policy that caches GET responses for five minutes, varying by every
/// query string key, and refuses to store responses that are unsafe to replay.
/// </summary>
public class AssetCachePolicy : IOutputCachePolicy
{
    private static readonly TimeSpan ResponseLifetime = TimeSpan.FromMinutes(5);

    /// <summary>Decides, before the endpoint runs, whether this request takes part in output caching.</summary>
    public ValueTask CacheRequestAsync(OutputCacheContext context, CancellationToken cancellation)
    {
        // IsGet compares the method case-insensitively.
        var attemptOutputCaching = HttpMethods.IsGet(context.HttpContext.Request.Method);
        context.EnableOutputCaching = attemptOutputCaching;
        context.AllowCacheLookup = attemptOutputCaching;
        context.AllowCacheStorage = attemptOutputCaching;
        context.AllowLocking = true;

        context.CacheVaryByRules.QueryKeys = "*";
        context.ResponseExpirationTimeSpan = ResponseLifetime;

        return ValueTask.CompletedTask;
    }

    /// <summary>Called when a cached response is about to be served; nothing to adjust.</summary>
    public ValueTask ServeFromCacheAsync(OutputCacheContext context, CancellationToken cancellation)
    {
        return ValueTask.CompletedTask;
    }

    /// <summary>
    /// Called after the endpoint produced a response; blocks storage of responses that set
    /// cookies or did not succeed, so they are never replayed to other callers.
    /// </summary>
    public ValueTask ServeResponseAsync(OutputCacheContext context, CancellationToken cancellation)
    {
        var response = context.HttpContext.Response;

        if (response.Headers.ContainsKey("Set-Cookie") || response.StatusCode != 200)
        {
            context.AllowCacheStorage = false;
        }

        return ValueTask.CompletedTask;
    }
}

Monitoring and Metrics

/// <summary>
/// Publishes cache hit, miss and operation-duration instruments on the "Dynaplex.Cache" meter.
/// </summary>
/// <remarks>
/// NOTE(review): <c>IMeterProvider</c> is not declared in this document - confirm which
/// abstraction supplies <c>GetMeter</c>.
/// </remarks>
public class CacheMetrics
{
    private readonly IMeterProvider _meterProvider;
    private readonly Counter<long> _hits;
    private readonly Counter<long> _misses;
    private readonly Histogram<double> _duration;

    /// <summary>Creates the instruments from the supplied meter provider.</summary>
    /// <exception cref="ArgumentNullException"><paramref name="meterProvider"/> is <c>null</c>.</exception>
    public CacheMetrics(IMeterProvider meterProvider)
    {
        _meterProvider = meterProvider ?? throw new ArgumentNullException(nameof(meterProvider));

        var meter = _meterProvider.GetMeter("Dynaplex.Cache");
        _hits = meter.CreateCounter<long>("cache.hits");
        _misses = meter.CreateCounter<long>("cache.misses");
        _duration = meter.CreateHistogram<double>("cache.operation.duration", "ms");
    }

    /// <summary>Counts one cache hit, attributed to the key's leading segment.</summary>
    public void RecordHit(string key) => _hits.Add(1, KeyGroupTag(key));

    /// <summary>Counts one cache miss, attributed to the key's leading segment.</summary>
    public void RecordMiss(string key) => _misses.Add(1, KeyGroupTag(key));

    /// <summary>Records how long a cache operation took.</summary>
    /// <param name="ms">Elapsed time in milliseconds.</param>
    /// <param name="operation">Operation name used as the tag value.</param>
    public void RecordDuration(double ms, string operation) =>
        _duration.Record(ms, new KeyValuePair<string, object?>("operation", operation));

    // Tags with the text before the first ':' (e.g. "asset" for "asset:42") instead of the
    // full key: full keys embed ids, which would create one time series per cached entity.
    // The tag type is spelled out so overload selection does not rest on target-typed new.
    private static KeyValuePair<string, object?> KeyGroupTag(string key)
    {
        var separator = key?.IndexOf(':') ?? -1;
        var group = separator > 0 ? key![..separator] : key;
        return new KeyValuePair<string, object?>("key", group);
    }
}

Configuration

{
  "Caching": {
    "Provider": "Redis",
    "Redis": {
      "Configuration": "localhost:6379,abortConnect=false",
      "InstanceName": "dynaplex",
      "DefaultExpiration": "00:05:00"
    },
    "Policies": {
      "ReferenceData": {
        "Duration": "1.00:00:00",
        "Priority": "High"
      },
      "UserSession": {
        "Duration": "00:20:00",
        "SlidingExpiration": true
      },
      "ApiResponse": {
        "Duration": "00:01:00",
        "VaryByUser": true
      }
    }
  }
}

Evaluation Criteria for Providers

  1. Performance: Latency, throughput
  2. Scalability: Clustering, sharding support
  3. Features: Data structures, pub/sub, transactions
  4. Cost: Infrastructure and operational costs
  5. Maintenance: Managed vs self-hosted
  6. Integration: .NET client quality

Migration Plan

  1. Phase 1: Implement cache abstraction
  2. Phase 2: Deploy Redis in development
  3. Phase 3: Cache reference data
  4. Phase 4: Add response caching
  5. Phase 5: Implement invalidation
  6. Phase 6: Monitor and optimize

Related ADRs

  • ADR-027: Event-Driven Architecture (cache invalidation events)
  • ADR-019: OpenTelemetry Integration (cache metrics)
  • ADR-031: API Gateway Pattern (centralized caching)