Announcement

👇Official Account👇

图片

Welcome to join the group & private message

Article first/tail QR code

Skip to content

从 trace 到洞察:Go 项目的可观测性闭环实践

可观测性是现代应用的核心能力。本文带你构建完整的 Go 项目可观测性体系。

一、可观测性三大支柱

1.1 日志(Logging)

记录离散事件,用于问题排查。

go
// Structured logging: emit a "user login" event with the user ID, client IP
// and latency attached as typed zap fields instead of being formatted into
// the message string.
log.Info("user login",
    zap.String("user_id", userID),
    zap.String("ip", clientIP),
    zap.Duration("latency", latency),
)

1.2 指标(Metrics)

记录可聚合的数值,用于监控告警。

go
// Counter: add one to the series selected by the path and status label values.
requestCounter.WithLabelValues(path, status).Inc()

// Histogram: record the request duration, converted to seconds, in the series
// selected by the path label value.
requestDuration.WithLabelValues(path).Observe(duration.Seconds())

1.3 追踪(Tracing)

记录请求链路,用于性能分析。

go
// Create a span named "process_order"; ctx is rebound to the context returned
// by Start so later calls can pass it on.
ctx, span := tracer.Start(ctx, "process_order")
// End the span when the enclosing function returns.
defer span.End()

// Attach the order ID to the span as a string attribute.
span.SetAttributes(attribute.String("order_id", orderID))

二、OpenTelemetry 集成

2.1 初始化

go
package telemetry

import (
    "context"
    "fmt"

    "go.opentelemetry.io/otel"
    "go.opentelemetry.io/otel/exporters/jaeger"
    "go.opentelemetry.io/otel/sdk/resource"
    sdktrace "go.opentelemetry.io/otel/sdk/trace"
    semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
)

// InitTracer builds a TracerProvider that exports spans in batches to a Jaeger
// collector and installs it as the global OpenTelemetry tracer provider.
//
// serviceName is recorded as the service name resource attribute. The service
// version ("1.0.0") and the collector endpoint
// (http://localhost:14268/api/traces) are fixed in this function.
//
// It returns the provider so the caller can manage its lifetime. On failure it
// returns a nil provider and an error that names the step that failed while
// still wrapping the underlying cause (usable with errors.Is / errors.As).
//
// NOTE(review): spans are batched, so the caller presumably needs to call
// Shutdown on the returned provider before exit to flush them — confirm at the
// call site.
// NOTE(review): the otel Jaeger exporter module has been deprecated upstream in
// favor of OTLP exporters — confirm the pinned otel version still provides it.
func InitTracer(serviceName string) (*sdktrace.TracerProvider, error) {
    // Exporter that posts spans to the Jaeger collector HTTP endpoint.
    exp, err := jaeger.New(jaeger.WithCollectorEndpoint(
        jaeger.WithEndpoint("http://localhost:14268/api/traces"),
    ))
    if err != nil {
        return nil, fmt.Errorf("creating jaeger exporter: %w", err)
    }

    // Resource attributes identifying this service on every exported span.
    res, err := resource.New(
        context.Background(),
        resource.WithAttributes(
            semconv.ServiceNameKey.String(serviceName),
            semconv.ServiceVersionKey.String("1.0.0"),
        ),
    )
    if err != nil {
        return nil, fmt.Errorf("building resource for service %q: %w", serviceName, err)
    }

    tp := sdktrace.NewTracerProvider(
        sdktrace.WithBatcher(exp),
        sdktrace.WithResource(res),
    )

    // Make the provider the process-wide default used by otel.Tracer.
    otel.SetTracerProvider(tp)
    return tp, nil
}

2.2 HTTP 中间件

go
// TracingMiddleware returns a gin middleware that wraps every request in a
// span obtained from the global "http-server" tracer.
//
// For each request it extracts any incoming span context from the request
// headers, starts a span named "<METHOD> <route>", records the HTTP method,
// URL and user agent as attributes, and replaces c.Request with a copy that
// carries the span's context so downstream handlers can continue the trace.
// After the handler chain finishes it records the response status code and
// marks the span as an error for 5xx responses.
//
// NOTE(review): Extract uses the globally registered propagator; otel's
// default is a no-op, so verify that otel.SetTextMapPropagator is called at
// startup, otherwise incoming trace headers are ignored.
func TracingMiddleware() gin.HandlerFunc {
    return func(c *gin.Context) {
        tracer := otel.Tracer("http-server")

        // Pick up the caller's span context (if any) from the request headers.
        ctx := otel.GetTextMapPropagator().Extract(c.Request.Context(),
            propagation.HeaderCarrier(c.Request.Header),
        )

        // FullPath is empty when no route matched; fall back to the method
        // alone instead of producing a name with a trailing space.
        spanName := c.Request.Method
        if route := c.FullPath(); route != "" {
            spanName += " " + route
        }

        ctx, span := tracer.Start(ctx, spanName)
        defer span.End()

        span.SetAttributes(
            attribute.String("http.method", c.Request.Method),
            attribute.String("http.url", c.Request.URL.String()),
            attribute.String("http.user_agent", c.Request.UserAgent()),
        )

        // Expose the span's context to the rest of the handler chain.
        c.Request = c.Request.WithContext(ctx)

        c.Next()

        // The status is only known once the handlers have run.
        status := c.Writer.Status()
        span.SetAttributes(attribute.Int("http.status_code", status))
        if status >= 500 {
            span.SetStatus(codes.Error, "server error")
        }
    }
}

三、数据库追踪

go
import (
    "github.com/XSAM/otelsql"
    semconv "go.opentelemetry.io/otel/semconv/v1.4.0"
)

// InitDB opens a database handle for dsn through otelsql.Open using the
// "mysql" driver name, with the MySQL db.system attribute attached to the
// instrumentation.
//
// It returns the handle on success, or nil and the error from otelsql.Open.
//
// NOTE(review): presumably a driver must be registered under the name "mysql"
// elsewhere (for example via a blank import) — confirm in the importing package.
func InitDB(dsn string) (*sql.DB, error) {
    opts := []otelsql.Option{
        otelsql.WithAttributes(semconv.DBSystemMySQL),
    }

    conn, openErr := otelsql.Open("mysql", dsn, opts...)
    if openErr != nil {
        return nil, openErr
    }
    return conn, nil
}

四、指标收集

4.1 Prometheus 集成

go
package metrics

import (
    "github.com/prometheus/client_golang/prometheus"
    "github.com/prometheus/client_golang/prometheus/promauto"
)

// HTTP server metrics created through promauto.
//
// NOTE(review): per promauto's documentation these constructors register the
// collectors with the default Prometheus registry when the package is
// initialized and panic if registration fails — confirm no other package
// registers the same metric names.
var (
    // RequestCounter counts HTTP requests ("http_requests_total"), partitioned
    // by the method, path and status labels.
    RequestCounter = promauto.NewCounterVec(
        prometheus.CounterOpts{
            Name: "http_requests_total",
            Help: "Total number of HTTP requests",
        },
        []string{"method", "path", "status"},
    )
    
    // RequestDuration is a histogram of HTTP request durations in seconds
    // ("http_request_duration_seconds"), partitioned by the method and path
    // labels and using the client library's default buckets.
    RequestDuration = promauto.NewHistogramVec(
        prometheus.HistogramOpts{
            Name:    "http_request_duration_seconds",
            Help:    "HTTP request duration in seconds",
            Buckets: prometheus.DefBuckets,
        },
        []string{"method", "path"},
    )
    
    // ActiveConnections is an unlabeled gauge ("active_connections") described
    // by its help text as the number of active connections.
    ActiveConnections = promauto.NewGauge(
        prometheus.GaugeOpts{
            Name: "active_connections",
            Help: "Number of active connections",
        },
    )
)

4.2 指标中间件

go
// MetricsMiddleware returns a gin middleware that records Prometheus metrics
// for every request.
//
// While the handler chain runs, metrics.ActiveConnections is raised by one and
// lowered again when the middleware returns. Once the chain has finished, the
// request is counted in metrics.RequestCounter (method, route, status) and its
// elapsed time in seconds is observed in metrics.RequestDuration (method,
// route). The route label is the value of c.FullPath().
func MetricsMiddleware() gin.HandlerFunc {
    return func(c *gin.Context) {
        begin := time.Now()

        // Track the request as in flight for the duration of the chain.
        inFlight := metrics.ActiveConnections
        inFlight.Inc()
        defer inFlight.Dec()

        c.Next()

        // Everything below depends on the outcome of the handler chain.
        elapsed := time.Since(begin)
        code := strconv.Itoa(c.Writer.Status())
        method, route := c.Request.Method, c.FullPath()

        metrics.RequestCounter.WithLabelValues(method, route, code).Inc()
        metrics.RequestDuration.WithLabelValues(method, route).Observe(elapsed.Seconds())
    }
}

五、日志追踪关联

go
// LoggerMiddleware returns a gin middleware that writes one access-log entry
// per request and tags it with the current trace ID.
//
// The trace ID is read from the span found in the request context; when that
// span's context is not valid the ID is the empty string. The ID is also
// stored in the request context under the key "trace_id" before the handler
// chain runs. After the chain finishes, the entry records the trace ID,
// method, URL path, response status, elapsed time and client IP.
//
// NOTE(review): the trace ID is only non-empty if an earlier middleware put a
// span into the request context — confirm the registration order.
// NOTE(review): static analysers flag plain string context keys because of
// collision risk; the key is left as is because changing its type would break
// existing readers of ctx.Value("trace_id").
func LoggerMiddleware() gin.HandlerFunc {
    return func(c *gin.Context) {
        began := time.Now()
        reqCtx := c.Request.Context()

        // Look up (not generate) the trace ID of the span already in flight.
        var traceID string
        if sc := trace.SpanFromContext(reqCtx).SpanContext(); sc.IsValid() {
            traceID = sc.TraceID().String()
        }

        // Hand the trace ID to downstream handlers through the request context.
        c.Request = c.Request.WithContext(
            context.WithValue(reqCtx, "trace_id", traceID),
        )

        c.Next()

        // Access log entry, written once the response status is known.
        logger.Info("http request",
            zap.String("trace_id", traceID),
            zap.String("method", c.Request.Method),
            zap.String("path", c.Request.URL.Path),
            zap.Int("status", c.Writer.Status()),
            zap.Duration("duration", time.Since(began)),
            zap.String("client_ip", c.ClientIP()),
        )
    }
}

六、告警配置

yaml
# Prometheus alerting rules for the HTTP API metrics.
# NOTE(review): this is the plain rule-file layout (top-level `groups`); a
# PrometheusRule custom resource would nest `groups` under `spec` — confirm
# which form the deployment expects.
groups:
  - name: api-alerts
    rules:
      # Critical: the share of requests with a 5xx status, computed from
      # 5-minute rates, has stayed above 5% for 5 minutes.
      - alert: HighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          
      # Warning: the 95th-percentile request duration, aggregated across all
      # label values, has stayed above 1.0 (seconds, per the metric name) for
      # 5 minutes.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95, 
            sum(rate(http_request_duration_seconds_bucket[5m])) by (le)
          ) > 1.0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency detected"

七、总结

| 组件 | 工具 | 用途 |
| --- | --- | --- |
| 追踪 | OpenTelemetry + Jaeger | 链路分析 |
| 指标 | Prometheus | 监控告警 |
| 日志 | ELK / Loki | 日志分析 |
| 可视化 | Grafana | 统一展示 |

构建完整的可观测性体系,让问题无处遁形。

上次更新于: