Commit
feat: ensure OTel collector is able to export metrics and traces (#298)
Signed-off-by: Lenin Mehedy <[email protected]>
leninmehedy authored Sep 7, 2023
1 parent e04de57 commit 8c24733
Showing 15 changed files with 3,385 additions and 108 deletions.
81 changes: 36 additions & 45 deletions charts/hedera-network/config-files/otel-collector-config.yaml
@@ -2,68 +2,59 @@ receivers:
otlp:
protocols:
grpc:
hostmetrics:
collection_interval: "15s"
scrapers:
cpu:
disk:
load:
filesystem:
memory:
network:
paging:
processes:
http:

# Collect node metrics
prometheus:
config:
scrape_configs:
- job_name: 'node-metrics-scraper' # network-node metrics
scrape_interval: {{ .otelDefaults.receivers.prometheus.scrapeInterval }}
static_configs:
- targets: {{ .otelDefaults.receivers.prometheus.scrapeTargets }}

processors:
batch:

exporters:
prometheus:
endpoint: "0.0.0.0:8889"
endpoint: "0.0.0.0:8889" # note: network-node-svc exposes this port for all Prometheus metrics instead of the node's port 9999
const_labels:
source: p-{{ default "otel-collector" .otelDefaults.nameOverride }} # PromQL: {source="p-otel-collector"}
{{- if eq .otelDefaults.prometheusRemoteWrite.enable "true" }}

{{- if eq .otelDefaults.exporters.prometheusRemoteWrite.enable "true" }}
prometheusremotewrite:
endpoint: "{{ .otelDefaults.prometheusRemoteWrite.endpoint }}"
endpoint: "{{ .otelDefaults.exporters.prometheusRemoteWrite.endpoint }}"
tls:
insecure: true
{{- .otelDefaults.exporters.prometheusRemoteWrite.tls | toYaml | nindent 6 }}
external_labels:
source: prw-{{ default "otel-collector" .otelDefaults.nameOverride }} # PromQL: {source="prw-otel-collector"}
{{- end }}
# logging:
# verbosity: detailed
# sampling_initial: 5

otlp:
endpoint: "{{ .otelDefaults.exporters.otlp.endpoint }}"
{{- if .otelDefaults.exporters.otlp.headers }}
headers:
{{- .otelDefaults.exporters.otlp.headers | toYaml | nindent 6 }}
{{- end }}
tls:
{{- .otelDefaults.exporters.otlp.tls | toYaml | nindent 6 }}

# jaeger:
# endpoint: jaeger-all-in-one:14250
# tls:
# insecure: true
#
# otlp:
# endpoint: tempo:4317
# tls:
# insecure: true

processors:
batch:
extensions:
health_check:

service:
# telemetry:
# logs:
# level: "debug"
# development: true
extensions: [health_check]
pipelines:
#traces:
# receivers: [otlp]
# processors: [batch]
# exporters: [logging, otlp, jaeger]
traces:
receivers: [otlp]
processors: [batch]
exporters: [otlp]
metrics:
receivers: [otlp, hostmetrics]
receivers: [prometheus]
processors: [batch]
exporters:
- prometheus
{{- if eq .otelDefaults.prometheusRemoteWrite.enable "true" }}
- prometheusremotewrite
{{- end }}
# exporters: [logging, prometheus{{ if eq .otelDefaults.prometheusRemoteWrite.enable "true" }}, prometheusremotewrite{{ end }}]
exporters:
- prometheus
{{- if eq .otelDefaults.exporters.prometheusRemoteWrite.enable "true" }}
- prometheusremotewrite
{{- end }}
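To sanity-check how the template above renders, the chart can be expanded locally with default values; the release name "fst" and the output path below are placeholders, and the chart path should point at your working copy:

    # Render the chart and inspect the resulting collector config ("fst" is an example release name).
    # Run `helm dependency update charts/hedera-network` first if subcharts are missing locally.
    helm template fst charts/hedera-network > /tmp/rendered.yaml
    grep -n -A 4 "node-metrics-scraper" /tmp/rendered.yaml      # prometheus receiver scrape job
    grep -n -A 6 "prometheusremotewrite" /tmp/rendered.yaml     # only rendered when the exporter is enabled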
11 changes: 6 additions & 5 deletions charts/hedera-network/templates/services/network-node-svc.yaml
@@ -11,23 +11,24 @@ spec:
selector:
app: network-{{ $nodeConfig.name }}
ports:
# GOSSIP port is used by nodes for gossip protocol
- name: gossip
protocol: TCP
port: 50111 # gossip port
targetPort: 50111
# GRPC-NON-TLS port is used for services API
- name: grpc-non-tls
protocol: TCP
port: 50211 # non-tls grpc client port
targetPort: 50211
# GRPC-TLS port
- name: grpc-tls
protocol: TCP
port: 50212 # tls grpc client port
targetPort: 50212
- name: otel-metrics
protocol: TCP
port: 8888
targetPort: 8888
- name: prometheus
# Node metrics port
# Rather than exposing the node's metrics port 9999 directly, we expose the otel-collector's Prometheus port here.
- name: prometheus # otel-collector's prometheus exporter port
protocol: TCP
port: 8889
targetPort: 8889
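A quick way to confirm that the service now fronts the collector's Prometheus exporter (port 8889) rather than the node's raw metrics port 9999; the service name below is only an example, so list the services first:

    # List the per-node services, then port-forward one of them (network-node0-svc is a placeholder name).
    kubectl get svc -l fullstack.hedera.com/type=network-node-svc
    kubectl port-forward svc/network-node0-svc 8889:8889 &
    curl -s http://localhost:8889/metrics | head    # node metrics re-exported by the otel-collector sidecar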
28 changes: 14 additions & 14 deletions charts/hedera-network/templates/sidecars/_otel-collector.tpl
@@ -8,26 +8,26 @@
securityContext:
{{- include "fullstack.root.security.context" . | nindent 4 }}
ports:
- name: otel-health
containerPort: 13133
protocol: TCP
- name: otel-metrics
containerPort: 8888
protocol: TCP
- name: otel-otlp
- name: otlp # otel port defined in otel-collector config
containerPort: 4317
protocol: TCP
- name: prometheus
- name: prometheus # prometheus exporter port as specified in otel-collector-config.yaml
containerPort: 8889
protocol: TCP
{{- with default $defaults.livenessProbe $otel.livenessProbe }}
- name: health # for otel-collector liveness check
containerPort: 13133
protocol: TCP
- name: metrics # default metrics port exposed by the otel-collector itself
containerPort: 8888
protocol: TCP
livenessProbe:
{{- toYaml . | nindent 4 }}
{{- end }}
{{- with default $defaults.readinessProbe $otel.readinessProbe }}
httpGet:
path: /
port: health
readinessProbe:
{{- toYaml . | nindent 4 }}
{{- end }}
httpGet:
path: /
port: health
volumeMounts:
- name: otel-collector-volume
mountPath: /etc/otelcol-contrib/config.yaml
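The port and probe wiring above can be checked against a running pod; the pod name below is a placeholder, and 13133 is the health_check extension port targeted by the probes:

    # <network-node-pod> is a placeholder; pick any pod that carries the otel-collector sidecar.
    kubectl get pods -l fullstack.hedera.com/type=network-node
    kubectl port-forward pod/<network-node-pod> 13133:13133 8888:8888 &
    curl -s http://localhost:13133/                 # health_check extension used by the liveness/readiness probes
    curl -s http://localhost:8888/metrics | head    # the collector's own telemetry on the "metrics" port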
@@ -10,8 +10,6 @@ spec:
matchLabels:
fullstack.hedera.com/type: network-node-svc
endpoints:
- port: otel-metrics
interval: 5s
- port: prometheus
interval: 5s
- port: prometheus # must match the prometheus port-name in network-node-svc.yaml
interval: 10s # ideally it should be higher than the node-metrics-scraper interval set in otel-collector-config.yaml
{{- end }}
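Once Prometheus has picked up this ServiceMonitor, the const_labels/external_labels set in otel-collector-config.yaml make the collector-exported series easy to filter; the query below assumes the expose_prometheus port-forward on localhost:9090 (see telemetry.sh further down):

    # Query all series exported through the collector's prometheus exporter.
    curl -sG http://localhost:9090/api/v1/query \
      --data-urlencode 'query={source="p-otel-collector"}' | head -c 500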
28 changes: 17 additions & 11 deletions charts/hedera-network/values.yaml
@@ -161,18 +161,24 @@ defaults:
repository: "otel/opentelemetry-collector-contrib"
tag: "0.72.0"
pullPolicy: "IfNotPresent"
livenessProbe:
httpGet:
path: /
port: otel-health
readinessProbe:
httpGet:
path: /
port: otel-health
resources: {}
prometheusRemoteWrite:
enable: "false"
endpoint: "http://prometheus.default.svc:9090/api/v1/write"
receivers:
prometheus:
scrapeTargets: [ 0.0.0.0:9999 ] # hedera node metrics are exposed at port 9999
scrapeInterval: 5s
exporters:
otlp:
endpoint: tempo:4317
tls:
insecure: true
prometheus:
tls:
insecure: true
prometheusRemoteWrite:
enable: "false"
endpoint: "http://prometheus.default.svc:9090/api/v1/write"
tls:
insecure: true

# This configures the minio tenant subchart
# Reference for configuration: https://github.com/minio/operator/blob/master/helm/tenant/values.yaml
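The shipped defaults for the otel sidecar can be inspected (and overridden) without editing the chart; the command below only greps the block shown in the diff, and the full key path to override is not reproduced here since it depends on where this block sits in values.yaml:

    # Print the packaged defaults around the remote-write settings.
    helm show values charts/hedera-network | grep -n -B 2 -A 6 "prometheusRemoteWrite"
    # To enable remote-write, set enable: "true" in a custom values file (the template compares it
    # as a string) and pass the file to helm with -f.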
31 changes: 27 additions & 4 deletions dev/Makefile
@@ -92,6 +92,12 @@ deploy-network: deploy-chart
kubectl get pods -o wide && \
echo ">> Waiting for network-node pods to be active (first deployment takes ~10m)...." && \
kubectl wait --for=jsonpath='{.status.phase}'=Running pod -l fullstack.hedera.com/type=network-node --timeout=900s
echo "" && \
echo ">> Service Information...." && \
echo "" && \
kubectl get svc -o wide && \
echo ">> Waiting for pods to be up (timeout 600s)" && \
kubectl wait --for=jsonpath='{.status.phase}'=Running pod -l fullstack.hedera.com/type=network-node --timeout=600s

.PHONY: destroy-network
destroy-network: destroy-test-container destroy-chart
@@ -151,7 +157,7 @@ destroy-gateway-api:
#source "${SCRIPTS_DIR}/${GATEWAY_API_SCRIPT}" && destroy_haproxy_ingress
source "${SCRIPTS_DIR}/${GATEWAY_API_SCRIPT}" && destroy_envoy_gateway_api

######################################### Prometheus #################################
######################################### Telemetry #################################
.PHONY: fetch-prometheus-operator-bundle
fetch-prometheus-operator-bundle:
source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && fetch-prometheus-operator-bundle
@@ -164,14 +170,31 @@ deploy-prometheus-operator: fetch-prometheus-operator-bundle
destroy-prometheus-operator:
source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && destroy-prometheus-operator

.PHONY: deploy-grafana-tempo
deploy-grafana-tempo:
source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && deploy_grafana_tempo && expose_grafana

.PHONY: destroy-grafana-tempo
destroy-grafana-tempo:
source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && unexpose_grafana && destroy_grafana_tempo

.PHONY: deploy-prometheus
deploy-prometheus: deploy-prometheus-operator
source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && deploy-prometheus
source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && deploy-prometheus && expose_prometheus

.PHONY: destroy-prometheus
destroy-prometheus:
-source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && destroy-prometheus
make destroy-prometheus-operator
-source "${SCRIPTS_DIR}/${TELEMETRY_SCRIPT}" && unexpose_prometheus && destroy-prometheus
-${MAKE} destroy-prometheus-operator

.PHONY: deploy-telemetry-stack
deploy-telemetry-stack: deploy-prometheus deploy-grafana-tempo

.PHONY: destroy-telemetry-stack
destroy-telemetry-stack:
# Note: the '-' prefix ignores errors so the remaining cleanup steps still run
-${MAKE} destroy-prometheus
-${MAKE} destroy-grafana-tempo

######################################### MinIO #################################
.PHONY: deploy-minio-operator
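A typical local loop with the new targets looks like this, run from the dev/ directory; the deploy targets also start the Prometheus and Grafana port-forwards:

    make deploy-telemetry-stack    # prometheus-operator + prometheus (localhost:9090) + grafana/tempo (localhost:3000)
    make deploy-network            # deploy the chart and wait for network-node pods
    # ...inspect node metrics in Prometheus and traces in Grafana/Tempo...
    make destroy-network
    make destroy-telemetry-stack   # removal targets tolerate components that are already gone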
82 changes: 80 additions & 2 deletions dev/scripts/telemetry.sh
@@ -61,7 +61,7 @@ function deploy-prometheus() {
kubectl create -f "${PROMETHEUS_RBAC_YAML}"
sleep 10
kubectl create -f "${PROMETHEUS_YAML}"
echo "Waiting for prometheus to be active..."
echo "Waiting for prometheus to be active (timeout 300s)..."
kubectl wait --for=condition=Ready pods -l app.kubernetes.io/name=prometheus -n default --timeout 300s
}

@@ -82,7 +82,8 @@ function deploy-prometheus-example-app() {
echo "PROMETHEUS_EXAMPLE_APP_YAML: ${PROMETHEUS_EXAMPLE_APP_YAML}"
echo "-----------------------------------------------------------------------------------------------------"
kubectl create -f "${PROMETHEUS_EXAMPLE_APP_YAML}"
kubectl wait --for=condition=Ready pods -l app=prometheus-example-app -n default --timeout 60s
echo "Waiting for prometheus example app to be active (timeout 300s)..."
kubectl wait --for=condition=Ready pods -l app=prometheus-example-app -n default --timeout 300s
}

function destroy-prometheus-example-app() {
@@ -94,3 +95,80 @@ function destroy-prometheus-example-app() {
local status="$?"
[[ "${status}" = 0 ]] && sleep 10
}

function expose_prometheus() {
export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/instance=prometheus" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace default port-forward $POD_NAME 9090 &
echo "Prometheus is exposed from ${POD_NAME} to port 9090"
}

function unexpose_prometheus() {
export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=prometheus,app.kubernetes.io/instance=prometheus" -o jsonpath="{.items[0].metadata.name}")
export PID=$(ps aux | grep "port-forward ${POD_NAME}" | sed -n 2p | awk '{ print $2 }')
[[ -z "${PID}" ]] && echo "No Prometheus port-forward PID is found" && return 0

if [[ "${PID}" ]]; then
echo ""
echo "Un-exposing Prometheus: ${POD_NAME} for PID: ${PID}"
echo "-----------------------------------------------------------------------------------------------------"
kill "${PID}" &>/dev/null || true
fi
}

function deploy_grafana_tempo() {
echo ""
echo "Deploying Grafana"
echo "-----------------------------------------------------------------------------------------------------"
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
helm upgrade --install tempo grafana/tempo
echo "Waiting for tempo to be active (timeout 300s)..."
kubectl wait --for=jsonpath='{.status.phase}'=Running pod -l "app.kubernetes.io/name=tempo,app.kubernetes.io/instance=tempo" --timeout=300s

helm upgrade -f "${TELEMETRY_DIR}/grafana/grafana-values.yaml" --install grafana grafana/grafana
echo "Waiting for grafana to be active (timeout 300s)..."
kubectl wait --for=jsonpath='{.status.phase}'=Running pod -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" --timeout=300s
}

function destroy_grafana_tempo() {
echo ""
echo "Destroying Grafana"
echo "-----------------------------------------------------------------------------------------------------"
helm delete grafana
helm delete tempo
}

function expose_grafana() {
export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}")
kubectl --namespace default port-forward $POD_NAME 3000 &
echo "Grafana is exposed from ${POD_NAME} to port 3000"
}

function unexpose_grafana() {
export POD_NAME=$(kubectl get pods --namespace default -l "app.kubernetes.io/name=grafana,app.kubernetes.io/instance=grafana" -o jsonpath="{.items[0].metadata.name}")
export PID=$(ps aux | grep "port-forward ${POD_NAME}" | sed -n 2p | awk '{ print $2 }')
[[ -z "${PID}" ]] && echo "No Grafana port-forward PID is found" && return 0

if [[ "${PID}" ]]; then
echo ""
echo "Un-exposing Grafana: ${POD_NAME} for PID: ${PID}"
echo "-----------------------------------------------------------------------------------------------------"
kill "${PID}" &>/dev/null || true
fi
}

function deploy_tracing_example_app() {
echo ""
echo "Deploying Example Tracing App"
echo "-----------------------------------------------------------------------------------------------------"
kubectl create -f "${TELEMETRY_DIR}/grafana/example-tracing-app.yaml"
echo "Waiting for tracing example app to be active (timeout 300s)..."
kubectl wait --for=condition=Ready pods -l app=xk6-tracing -n default --timeout 300s
}

function destroy_tracing_example_app() {
echo ""
echo "Destroying Example Tracing App"
echo "-----------------------------------------------------------------------------------------------------"
kubectl delete -f "${TELEMETRY_DIR}/grafana/example-tracing-app.yaml"
}
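These functions are normally driven by the Makefile targets above, but they can also be sourced for ad-hoc use; the sketch below assumes the script's path variables (TELEMETRY_DIR and the bundled YAML paths) resolve the same way they do when the Makefile sources it:

    # Ad-hoc usage sketch; run from the repository root with the same environment the Makefile sets up.
    source dev/scripts/telemetry.sh
    deploy_grafana_tempo && expose_grafana     # Grafana UI on http://localhost:3000
    deploy_tracing_example_app                 # xk6-tracing sample app emitting traces
    # ...later...
    destroy_tracing_example_app
    unexpose_grafana && destroy_grafana_tempo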