Documentation

operations/operations.md

Operations Guide for Dynaplex Architecture

This guide covers monitoring, observability, deployment, and operational procedures for the Acsis Core Dynaplex architecture.

🎯 Operational Overview

The Dynaplex architecture requires comprehensive operational practices:

  • Observability: Distributed tracing, metrics, and logging
  • Health Monitoring: Service health checks and dependency tracking
  • Performance Management: Resource monitoring and optimization
  • Incident Response: Detection, diagnosis, and resolution
  • Deployment: Zero-downtime deployments and rollbacks

📊 Observability Stack

Architecture Overview

graph TB
    subgraph Services
        A[Service A]
        B[Service B]
        C[Service C]
    end

    subgraph Telemetry
        D[OpenTelemetry Collector]
        E[Prometheus]
        F[Jaeger]
        G[Elasticsearch]
    end

    subgraph Visualization
        H[Grafana]
        I[Kibana]
        J[Aspire Dashboard]
    end

    A --> D
    B --> D
    C --> D
    D --> E
    D --> F
    D --> G
    E --> H
    G --> I
    D --> J

OpenTelemetry Configuration

// Program.cs - Service configuration
var builder = WebApplication.CreateBuilder(args);

// Add service defaults (includes OpenTelemetry)
builder.AddServiceDefaults();

// Configure OpenTelemetry resource attributes, tracing, and metrics.
// FIX: top-level Program.cs statements have no bare `configuration` variable in
// scope — the OTLP endpoint must be read from builder.Configuration.
builder.Services.AddOpenTelemetry()
    .ConfigureResource(resource => resource
        .AddService(serviceName: "asset-service",
            serviceVersion: Assembly.GetExecutingAssembly().GetName().Version?.ToString(),
            serviceInstanceId: Environment.MachineName))
    .WithTracing(tracing => tracing
        .AddAspNetCoreInstrumentation()
        .AddHttpClientInstrumentation()
        .AddSqlClientInstrumentation()
        .AddEntityFrameworkCoreInstrumentation()
        // Must match the ActivitySource name declared in the Telemetry class.
        .AddSource("Acsis.Dynaplex.Engines.Assets")
        .AddOtlpExporter(options =>
        {
            options.Endpoint = new Uri(builder.Configuration["Otlp:Endpoint"]);
        }))
    .WithMetrics(metrics => metrics
        .AddAspNetCoreInstrumentation()
        .AddHttpClientInstrumentation()
        .AddRuntimeInstrumentation()
        // Must match the Meter name declared in the Telemetry class.
        .AddMeter("Acsis.Dynaplex.Engines.Assets")
        .AddOtlpExporter(options =>
        {
            options.Endpoint = new Uri(builder.Configuration["Otlp:Endpoint"]);
        }));

// Custom activity source and meter for business operations.
// The names here must match the .AddSource(...) / .AddMeter(...) registrations
// in the OpenTelemetry pipeline, or spans/metrics are silently dropped.
public static class Telemetry
{
    // Produces distributed-trace spans for business operations (StartActivity).
    public static readonly ActivitySource ActivitySource =
        new("Acsis.Dynaplex.Engines.Assets", "1.0.0");

    // Root meter from which all custom asset-engine instruments are created.
    public static readonly Meter Meter =
        new("Acsis.Dynaplex.Engines.Assets", "1.0.0");

    // Monotonic count of assets created; second argument is the unit string.
    public static readonly Counter<long> AssetCreatedCounter =
        Meter.CreateCounter<long>("assets.created", "Assets");

    // Distribution of asset-processing durations, recorded in milliseconds.
    public static readonly Histogram<double> AssetProcessingDuration =
        Meter.CreateHistogram<double>("assets.processing.duration", "ms");
}

// Usage in services
public class AssetService
{
    /// <summary>
    /// Creates a new asset while emitting full telemetry: a trace span tagged
    /// with the asset type and id, a created-counter increment on success, and
    /// a processing-duration histogram sample whether or not the call succeeds.
    /// </summary>
    public async Task<Asset> CreateAssetAsync(CreateAssetRequest request)
    {
        using var activity = Telemetry.ActivitySource.StartActivity("CreateAsset");
        activity?.SetTag("asset.type", request.Type);

        var timer = Stopwatch.StartNew();
        try
        {
            var created = await _repository.CreateAsync(request);

            Telemetry.AssetCreatedCounter.Add(1,
                new KeyValuePair<string, object>("type", request.Type));

            activity?.SetTag("asset.id", created.Id);
            activity?.SetStatus(ActivityStatusCode.Ok);

            return created;
        }
        catch (Exception ex)
        {
            // Mark the span as failed before letting the exception propagate.
            activity?.SetStatus(ActivityStatusCode.Error, ex.Message);
            throw;
        }
        finally
        {
            // Duration is recorded on both the success and failure paths.
            Telemetry.AssetProcessingDuration.Record(
                timer.ElapsedMilliseconds,
                new KeyValuePair<string, object>("operation", "create"));
        }
    }
}

🏥 Health Monitoring

Health Check Implementation

// Health check configuration
builder.Services.AddHealthChecks()
    // Database health
    .AddSqlServer(
        configuration.GetConnectionString("Default"),
        name: "database",
        tags: new[] { "db", "sql", "critical" })

    // Redis cache health
    .AddRedis(
        configuration.GetConnectionString("Redis"),
        name: "cache",
        tags: new[] { "cache", "redis" })

    // Dependent service health
    .AddTypeActivatedCheck<ServiceHealthCheck>(
        "core-data-service",
        args: new object[] { "https://localhost:40443/health" },
        tags: new[] { "dependency", "core-data" })

    // Custom business logic health
    .AddCheck<BusinessLogicHealthCheck>(
        "business-logic",
        tags: new[] { "business" });

// Custom health check
public class BusinessLogicHealthCheck : IHealthCheck
{
    private readonly IServiceProvider _serviceProvider;

    // FIX: the original snippet declared _serviceProvider but never assigned
    // it, so the first health probe would throw a NullReferenceException.
    public BusinessLogicHealthCheck(IServiceProvider serviceProvider)
    {
        _serviceProvider = serviceProvider
            ?? throw new ArgumentNullException(nameof(serviceProvider));
    }

    /// <summary>
    /// Verifies critical business operations: connectivity to the asset
    /// service (Unhealthy when it fails) and license validity (Degraded when
    /// the license is about to expire). Any thrown exception maps to Unhealthy.
    /// </summary>
    public async Task<HealthCheckResult> CheckHealthAsync(
        HealthCheckContext context,
        CancellationToken cancellationToken = default)
    {
        try
        {
            // Health checks are singletons; resolve scoped services from a
            // fresh scope instead of capturing them in the constructor.
            using var scope = _serviceProvider.CreateScope();
            var service = scope.ServiceProvider.GetRequiredService<IAssetService>();

            // Verify critical business operations
            var canConnect = await service.VerifyConnectionAsync();
            var hasValidLicense = await service.CheckLicenseAsync();

            if (!canConnect)
                return HealthCheckResult.Unhealthy("Cannot connect to asset service");

            if (!hasValidLicense)
                return HealthCheckResult.Degraded("License expires soon");

            return HealthCheckResult.Healthy("All business operations functional");
        }
        catch (Exception ex)
        {
            return HealthCheckResult.Unhealthy("Business logic check failed", ex);
        }
    }
}

// Health check endpoints
app.MapHealthChecks("/health", new HealthCheckOptions
{
    ResponseWriter = UIResponseWriter.WriteHealthCheckUIResponse
});

app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("critical")
});

app.MapHealthChecks("/health/live", new HealthCheckOptions
{
    Predicate = _ => false // Only return 200 if service is alive
});

Aspire Dashboard

The .NET Aspire dashboard provides real-time monitoring:

# Access dashboard after starting AppHost
http://localhost:15045

# Features available:
- Service topology visualization
- Distributed trace viewing
- Log aggregation
- Metrics dashboards
- Resource consumption

📈 Metrics and Monitoring

Prometheus Configuration

# prometheus.yml
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  - job_name: 'aspire-services'
    static_configs:
      - targets:
        - 'localhost:40443'  # CoreData
        - 'localhost:41443'  # Importer
        - 'localhost:42443'  # SystemEnvironment
        - 'localhost:43443'  # Catalog
    metrics_path: '/metrics'

  - job_name: 'aspire-collector'
    static_configs:
      - targets: ['localhost:4317']

rule_files:
  - 'alerts.yml'

Alert Rules

# alerts.yml
groups:
  - name: service_alerts
    interval: 30s
    rules:
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.instance }} has been down for more than 1 minute."

      - alert: HighErrorRate
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.service }}"
          description: "Error rate is above 5% for the last 5 minutes."

      - alert: HighLatency
        expr: histogram_quantile(0.95, http_request_duration_seconds_bucket) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency on {{ $labels.service }}"
          description: "95th percentile latency is above 1 second."

      - alert: DatabaseConnectionFailure
        expr: db_connection_failures_total > 5
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Database connection failures"
          description: "More than 5 database connection failures in 2 minutes."

Grafana Dashboards

{
  "dashboard": {
    "title": "Dynaplex Service Metrics",
    "panels": [
      {
        "title": "Request Rate",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "{{service}} - {{method}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "targets": [
          {
            "expr": "rate(http_requests_total{status=~\"5..\"}[5m])",
            "legendFormat": "{{service}}"
          }
        ]
      },
      {
        "title": "Response Time (p95)",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, http_request_duration_seconds_bucket)",
            "legendFormat": "{{service}}"
          }
        ]
      },
      {
        "title": "Active Connections",
        "targets": [
          {
            "expr": "http_connections_active",
            "legendFormat": "{{service}}"
          }
        ]
      }
    ]
  }
}

📝 Logging

Structured Logging Configuration

// Program.cs
// FIX: the third lambda parameter is the Serilog LoggerConfiguration, not an
// IConfiguration — the original `configuration["Otlp:Endpoint"]` tried to
// index the LoggerConfiguration, which does not compile. Read app settings
// from context.Configuration instead, and name the parameter unambiguously.
builder.Host.UseSerilog((context, services, loggerConfiguration) => loggerConfiguration
    .ReadFrom.Configuration(context.Configuration)
    .ReadFrom.Services(services)
    .Enrich.FromLogContext()
    .Enrich.WithMachineName()
    .Enrich.WithEnvironmentName()
    .Enrich.WithProperty("Service", "AssetService")
    .WriteTo.Console(new JsonFormatter())
    .WriteTo.OpenTelemetry(options =>
    {
        options.Endpoint = context.Configuration["Otlp:Endpoint"];
        options.ResourceAttributes = new Dictionary<string, object>
        {
            ["service.name"] = "asset-service"
        };
    }));

// Logging in services
public class AssetService
{
    private readonly ILogger<AssetService> _logger;

    /// <summary>
    /// Retrieves an asset by id inside a logging scope, so every log entry in
    /// the operation carries AssetId/Operation structured properties. Throws
    /// NotFoundException when no asset exists for the given id.
    /// </summary>
    public async Task<Asset> GetAssetAsync(Guid id)
    {
        var scopeState = new Dictionary<string, object>
        {
            ["AssetId"] = id,
            ["Operation"] = "GetAsset"
        };

        using (_logger.BeginScope(scopeState))
        {
            _logger.LogInformation("Retrieving asset {AssetId}", id);

            try
            {
                var asset = await _repository.GetByIdAsync(id);

                if (asset == null)
                {
                    _logger.LogWarning("Asset {AssetId} not found", id);
                    throw new NotFoundException($"Asset {id} not found");
                }

                _logger.LogInformation("Successfully retrieved asset {AssetId}", id);
                return asset;
            }
            catch (Exception ex)
            {
                // Log with the exception attached, then rethrow unchanged.
                _logger.LogError(ex, "Error retrieving asset {AssetId}", id);
                throw;
            }
        }
    }
}

Log Aggregation with ELK

# docker-compose.yml
services:
  elasticsearch:
    image: elasticsearch:8.0
    environment:
      - discovery.type=single-node
      - xpack.security.enabled=false
    ports:
      - "9200:9200"

  logstash:
    image: logstash:8.0
    volumes:
      - ./logstash.conf:/usr/share/logstash/pipeline/logstash.conf
    ports:
      - "5000:5000"

  kibana:
    image: kibana:8.0
    environment:
      - ELASTICSEARCH_HOSTS=http://elasticsearch:9200
    ports:
      - "5601:5601"
# logstash.conf
input {
  tcp {
    port => 5000
    codec => json
  }
}

filter {
  if [Properties][Service] {
    mutate {
      add_field => { "service" => "%{[Properties][Service]}" }
    }
  }

  if [Properties][AssetId] {
    mutate {
      add_field => { "asset_id" => "%{[Properties][AssetId]}" }
    }
  }
}

output {
  elasticsearch {
    hosts => ["elasticsearch:9200"]
    index => "dynaplex-%{service}-%{+YYYY.MM.dd}"
  }
}

🚀 Deployment

Deployment Pipeline

# azure-pipelines.yml
trigger:
  branches:
    include:
      - main
      - release/*

stages:
  - stage: Build
    jobs:
      - job: BuildServices
        steps:
          - task: DotNetCoreCLI@2
            displayName: 'Build Solution'
            inputs:
              command: 'build'
              projects: 'acsis-core.slnx'
              arguments: '--configuration Release'

          - task: DotNetCoreCLI@2
            displayName: 'Run Tests'
            inputs:
              command: 'test'
              projects: '**/*Tests.csproj'
              arguments: '--configuration Release --collect:"XPlat Code Coverage"'

  - stage: Package
    jobs:
      - job: CreateContainers
        steps:
          - task: Docker@2
            displayName: 'Build Service Images'
            inputs:
              command: 'buildAndPush'
              repository: 'acsis/$(Build.SourceBranchName)'
              dockerfile: '**/Dockerfile'
              containerRegistry: 'acsisRegistry'
              tags: |
                $(Build.BuildId)
                latest

  - stage: Deploy_Dev
    jobs:
      - deployment: DeployToDev
        environment: 'Development'
        strategy:
          runOnce:
            deploy:
              steps:
                - task: KubernetesManifest@0
                  displayName: 'Deploy to Kubernetes'
                  inputs:
                    action: 'deploy'
                    manifests: |
                      kubernetes/namespace.yaml
                      kubernetes/services/*.yaml
                      kubernetes/deployments/*.yaml

  - stage: Deploy_Prod
    condition: and(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main'))
    jobs:
      - deployment: DeployToProd
        environment: 'Production'
        strategy:
          canary:
            increments: [10, 50, 100]
            preDeploy:
              steps:
                - script: echo "Starting canary deployment"
            deploy:
              steps:
                - task: KubernetesManifest@0
                  inputs:
                    action: 'deploy'
                    strategy: 'canary'
                    percentage: $(strategy.increment)

Kubernetes Deployment

# kubernetes/deployments/asset-service.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: asset-service
  labels:
    app: asset-service
    component: dynaplex
spec:
  replicas: 3
  selector:
    matchLabels:
      app: asset-service
  template:
    metadata:
      labels:
        app: asset-service
      annotations:
        prometheus.io/scrape: "true"
        prometheus.io/port: "80"
        prometheus.io/path: "/metrics"
    spec:
      containers:
      - name: asset-service
        image: acsis/asset-service:latest
        ports:
        - containerPort: 80
        env:
        - name: ASPNETCORE_ENVIRONMENT
          value: "Production"
        - name: ConnectionStrings__Default
          valueFrom:
            secretKeyRef:
              name: db-connection
              key: connectionString
        livenessProbe:
          httpGet:
            path: /health/live
            port: 80
          initialDelaySeconds: 30
          periodSeconds: 10
        readinessProbe:
          httpGet:
            path: /health/ready
            port: 80
          initialDelaySeconds: 5
          periodSeconds: 5
        resources:
          requests:
            memory: "256Mi"
            cpu: "250m"
          limits:
            memory: "512Mi"
            cpu: "500m"
---
apiVersion: v1
kind: Service
metadata:
  name: asset-service
spec:
  selector:
    app: asset-service
  ports:
  - port: 80
    targetPort: 80
  type: ClusterIP

Zero-Downtime Deployment

// Graceful shutdown configuration
public class Program
{
    public static void Main(string[] args)
    {
        var host = CreateHostBuilder(args).Build();

        // Hook the shutdown notification so in-flight work can drain before
        // the process exits.
        host.Services
            .GetRequiredService<IHostApplicationLifetime>()
            .ApplicationStopping
            .Register(() =>
            {
                Console.WriteLine("Application is shutting down...");
                // Stop accepting new requests
                // Wait for existing requests to complete
            });

        host.Run();
    }
}

// In Program.cs
builder.Host.ConfigureHostOptions(options =>
{
    options.ShutdownTimeout = TimeSpan.FromSeconds(30);
});

// Health check for deployment readiness
app.MapHealthChecks("/health/ready", new HealthCheckOptions
{
    Predicate = check => check.Tags.Contains("ready"),
    ResponseWriter = async (context, report) =>
    {
        context.Response.ContentType = "application/json";

        var result = JsonSerializer.Serialize(new
        {
            status = report.Status.ToString(),
            checks = report.Entries.Select(e => new
            {
                name = e.Key,
                status = e.Value.Status.ToString(),
                description = e.Value.Description
            })
        });

        await context.Response.WriteAsync(result);
    }
});

🔥 Incident Response

Incident Detection

// Anomaly detection service
public class AnomalyDetectionService
{
    private readonly ILogger<AnomalyDetectionService> _logger;
    private readonly IMetrics _metrics;

    /// <summary>
    /// Samples the last five minutes of metrics and raises an incident for
    /// each threshold breach: error rate (High), response time (Medium),
    /// throughput (Low).
    /// </summary>
    public async Task MonitorServiceHealth()
    {
        var window = TimeSpan.FromMinutes(5);
        var errorRate = await _metrics.GetErrorRateAsync(window);
        var responseTime = await _metrics.GetAverageResponseTimeAsync(window);
        var throughput = await _metrics.GetThroughputAsync(window);

        // Check for anomalies: 5% error rate threshold
        if (errorRate > 0.05)
        {
            await RaiseIncident(IncidentSeverity.High,
                $"Error rate {errorRate:P} exceeds threshold");
        }

        if (responseTime > TimeSpan.FromSeconds(2))
        {
            await RaiseIncident(IncidentSeverity.Medium,
                $"Response time {responseTime.TotalSeconds}s exceeds threshold");
        }

        // Requests per second
        if (throughput < 100)
        {
            await RaiseIncident(IncidentSeverity.Low,
                "Throughput below expected levels");
        }
    }

    /// <summary>
    /// Builds the incident record, logs it, and fans alerts out to PagerDuty,
    /// Slack, and the ticketing system.
    /// </summary>
    private async Task RaiseIncident(IncidentSeverity severity, string description)
    {
        var incident = new Incident
        {
            Id = Guid.NewGuid(),
            Severity = severity,
            Description = description,
            DetectedAt = DateTime.UtcNow,
            Service = "AssetService"
        };

        _logger.LogError("Incident detected: {Severity} - {Description}",
            severity, description);

        // Send alerts
        await SendPagerDutyAlert(incident);
        await SendSlackNotification(incident);
        await CreateIncidentTicket(incident);
    }
}

Runbook: Service Unavailable

## Service Unavailable Runbook

### Symptoms
- HTTP 503 errors
- Health check failures
- No response from service endpoints

### Immediate Actions
1. Check service status:
   ```bash
   kubectl get pods -l app=asset-service
   kubectl describe pod <pod-name>
  2. Check recent deployments:

    kubectl rollout history deployment/asset-service
    
  3. Check resource usage:

    kubectl top pods -l app=asset-service
    

Diagnosis

  1. Review logs:

    kubectl logs -l app=asset-service --tail=100
    
  2. Check database connectivity:

    kubectl exec -it <pod-name> -- nc -zv database-server 1433
    
  3. Review recent changes:

    git log --oneline -10
    

Resolution

  1. If pod is crashed:

    kubectl delete pod <pod-name>
    
  2. If deployment issue:

    kubectl rollout undo deployment/asset-service
    
  3. If resource exhaustion:

    kubectl scale deployment/asset-service --replicas=5
    

Post-Incident

  1. Document root cause
  2. Update monitoring thresholds
  3. Add preventive measures
  4. Schedule post-mortem meeting

## 🔧 Performance Tuning

### Database Optimization

```csharp
// Connection pooling configuration
services.AddDbContext<AssetContext>(options =>
{
    options.UseSqlServer(connectionString, sqlOptions =>
    {
        sqlOptions.EnableRetryOnFailure(
            maxRetryCount: 5,
            maxRetryDelay: TimeSpan.FromSeconds(30),
            errorNumbersToAdd: null);

        sqlOptions.CommandTimeout(30);
    });
}, ServiceLifetime.Scoped);

// Connection string with pooling
"Server=server;Database=db;Min Pool Size=10;Max Pool Size=100;Connection Lifetime=300;"

Caching Strategy

// Redis caching configuration
services.AddStackExchangeRedisCache(options =>
{
    options.Configuration = configuration.GetConnectionString("Redis");
    options.InstanceName = "Dynaplex";
});

// Memory caching with size limits
services.AddMemoryCache(options =>
{
    options.SizeLimit = 1000;
    options.CompactionPercentage = 0.25;
});

// Hybrid caching pattern
public class CachedAssetService : IAssetService
{
    private readonly IAssetService _innerService;
    private readonly IDistributedCache _distributedCache;
    private readonly IMemoryCache _memoryCache;

    /// <summary>
    /// Reads through a two-level cache: L1 in-process memory (5 min sliding),
    /// L2 Redis (15 min sliding), then the decorated service on a full miss.
    /// </summary>
    public async Task<Asset> GetAssetAsync(Guid id)
    {
        var memoryKey = $"asset_{id}";
        var redisKey = $"asset:{id}";

        // L1 Cache - Memory
        if (_memoryCache.TryGetValue(memoryKey, out Asset cached))
            return cached;

        // L2 Cache - Redis
        var cachedJson = await _distributedCache.GetStringAsync(redisKey);
        if (!string.IsNullOrEmpty(cachedJson))
        {
            var asset = JsonSerializer.Deserialize<Asset>(cachedJson);

            // FIX: a non-empty but "null"/corrupt payload deserializes to
            // null; the original cached and returned that null. Fall through
            // to the database instead.
            if (asset != null)
            {
                _memoryCache.Set(memoryKey, asset,
                    new MemoryCacheEntryOptions().SetSize(1));
                return asset;
            }
        }

        // Database (decorated service)
        var dbAsset = await _innerService.GetAssetAsync(id);

        // Cache in both layers; Redis outlives the L1 entry so memory can be
        // repopulated without another database round-trip.
        await _distributedCache.SetStringAsync(redisKey,
            JsonSerializer.Serialize(dbAsset),
            new DistributedCacheEntryOptions
            {
                SlidingExpiration = TimeSpan.FromMinutes(15)
            });

        _memoryCache.Set(memoryKey, dbAsset,
            new MemoryCacheEntryOptions()
                .SetSize(1)
                .SetSlidingExpiration(TimeSpan.FromMinutes(5)));

        return dbAsset;
    }
}

📊 Capacity Planning

Resource Monitoring

// Resource metrics collection.
// Hosted background loop that samples process-level resource usage every
// 30 seconds until the host signals shutdown, logging a structured snapshot
// and forwarding it to the monitoring system.
public class ResourceMonitor : BackgroundService
{
    private readonly ILogger<ResourceMonitor> _logger;

    protected override async Task ExecuteAsync(CancellationToken stoppingToken)
    {
        while (!stoppingToken.IsCancellationRequested)
        {
            var process = Process.GetCurrentProcess();

            var metrics = new
            {
                CpuUsage = GetCpuUsage(), // helper defined elsewhere — units/window not shown here; TODO confirm
                MemoryUsage = process.WorkingSet64 / (1024 * 1024), // MB
                ThreadCount = process.Threads.Count,
                HandleCount = process.HandleCount,
                // Cumulative GC collection counts per generation since start.
                GcGen0 = GC.CollectionCount(0),
                GcGen1 = GC.CollectionCount(1),
                GcGen2 = GC.CollectionCount(2),
                GcMemory = GC.GetTotalMemory(false) / (1024 * 1024) // MB
            };

            // {@Metrics} destructures the anonymous object into structured
            // log properties (Serilog-style).
            _logger.LogInformation("Resource metrics: {@Metrics}", metrics);

            // Send to monitoring system (helper defined elsewhere).
            await SendMetricsToPrometheus(metrics);

            // Task.Delay observes stoppingToken, so shutdown is not delayed
            // by the full 30-second sleep.
            await Task.Delay(TimeSpan.FromSeconds(30), stoppingToken);
        }
    }
}

Scaling Rules

# Horizontal Pod Autoscaler
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: asset-service-hpa
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: asset-service
  minReplicas: 2
  maxReplicas: 10
  metrics:
  - type: Resource
    resource:
      name: cpu
      target:
        type: Utilization
        averageUtilization: 70
  - type: Resource
    resource:
      name: memory
      target:
        type: Utilization
        averageUtilization: 80
  - type: Pods
    pods:
      metric:
        name: http_requests_per_second
      target:
        type: AverageValue
        averageValue: "1000"

🔒 Security Operations

Security Monitoring

// Security event monitoring
public class SecurityMonitor
{
    private readonly ILogger<SecurityMonitor> _logger;

    // FIX: the original referenced an out-of-scope `services` variable inside
    // an instance method. DI registration needs the startup-time
    // IServiceCollection, so it must be passed in explicitly.
    public void MonitorAuthenticationEvents(IServiceCollection services)
    {
        ArgumentNullException.ThrowIfNull(services);

        // Track failed login attempts
        services.AddSingleton<IAuthenticationEventHandler, AuthEventHandler>();
    }
}

public class AuthEventHandler : IAuthenticationEventHandler
{
    // FIX: the original used `await` inside a non-async method (a compile
    // error) and referenced an undeclared _logger field; declare the field,
    // inject it, and make the handler async.
    private readonly ILogger<AuthEventHandler> _logger;

    public AuthEventHandler(ILogger<AuthEventHandler> logger)
    {
        _logger = logger ?? throw new ArgumentNullException(nameof(logger));
    }

    /// <summary>
    /// Logs the failed authentication attempt and, when the pattern looks
    /// like a brute-force probe, blocks the source IP and raises an alert.
    /// </summary>
    public async Task HandleFailedAuthentication(AuthenticationFailedContext context)
    {
        _logger.LogWarning("Authentication failed: {User} from {IP}",
            context.Principal?.Identity?.Name,
            context.HttpContext.Connection.RemoteIpAddress);

        // Check for brute force
        if (IsBreachAttempt(context))
        {
            // Block IP
            await BlockIpAddress(context.HttpContext.Connection.RemoteIpAddress);

            // Send security alert
            await SendSecurityAlert("Possible brute force attack detected");
        }
    }
}

📚 Operational Resources

Dashboards and Tools

  • Aspire Dashboard: http://localhost:15045
  • Grafana: http://localhost:3000
  • Prometheus: http://localhost:9090
  • Jaeger: http://localhost:16686
  • Kibana: http://localhost:5601

Useful Commands

# View service logs
kubectl logs -f deployment/asset-service

# Check service health
curl http://service-url/health

# View metrics
curl http://service-url/metrics

# Database connection test
kubectl exec -it pod-name -- sqlcmd -S server -U user -P pass -Q "SELECT 1"

# Force pod restart
kubectl rollout restart deployment/asset-service

# Scale deployment
kubectl scale deployment/asset-service --replicas=5

# View resource usage
kubectl top pods -l app=asset-service

Emergency Contacts

  • On-Call Engineer: Use PagerDuty
  • Platform Team: platform@acsis.com
  • Security Team: security@acsis.com
  • Database Team: dba@acsis.com

🎯 SLIs, SLOs, and SLAs

Service Level Indicators (SLIs)

// Service Level Indicator instruments (prometheus-net style API —
// Metrics.CreateGauge/CreateHistogram/CreateCounter; TODO confirm package).
public class SliMetrics
{
    // Availability SLI: fraction of successful requests (0.0–1.0).
    public static readonly Gauge Availability = Metrics.CreateGauge(
        "service_availability_ratio",
        "Ratio of successful requests to total requests");

    // Latency SLI: request duration distribution. Linear buckets cover
    // 0.1s–1.0s in 0.1s steps, matching the <200ms p95 / <1s p99 SLOs.
    public static readonly Histogram Latency = Metrics.CreateHistogram(
        "request_duration_seconds",
        "Request duration in seconds",
        new HistogramConfiguration
        {
            Buckets = Histogram.LinearBuckets(0.1, 0.1, 10)
        });

    // Error-rate SLI numerator: monotonically increasing error count.
    public static readonly Counter Errors = Metrics.CreateCounter(
        "request_errors_total",
        "Total number of request errors");
}

Service Level Objectives (SLOs)

| Metric        | Target     | Measurement Window |
| ------------- | ---------- | ------------------ |
| Availability  | 99.9%      | 30 days            |
| Latency (p95) | < 200ms    | 5 minutes          |
| Latency (p99) | < 1s       | 5 minutes          |
| Error Rate    | < 0.1%     | 1 hour             |
| Throughput    | > 1000 RPS | 1 minute           |

Service Level Agreements (SLAs)

  • Uptime: 99.9% monthly availability
  • Response Time: 95% of requests under 200ms
  • Support: Critical issues resolved within 4 hours
  • Data Durability: 99.999999% (eight nines)