Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Enhance reconcile process to fix inconsistency between etcd cluster and StatefulSet #50

Closed
wants to merge 9 commits into from
36 changes: 36 additions & 0 deletions .github/workflows/test-e2e-failpoint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# CI workflow: creates a kind cluster on an Ubuntu runner, then runs
# `make gofail-enable` followed by `make test-e2e-failpoint`.
name: E2E Failpoint Tests

# Runs on every push and every pull request; no branch or path filters are set.
on:
push:
pull_request:

jobs:
test-e2e:
name: Run on Ubuntu
runs-on: ubuntu-latest
steps:
# Both actions below are referenced by full commit SHA rather than by a tag.
- name: Clone the code
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683

# The Go toolchain version is taken from the repository's go.mod.
- name: Setup Go
uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a
with:
go-version-file: 'go.mod'

# Downloads the linux-amd64 kind binary from the "latest" URL (the kind
# version is therefore not pinned) and moves it to /usr/local/bin.
- name: Install the latest version of kind
run: |
curl -Lo ./kind https://kind.sigs.k8s.io/dl/latest/kind-linux-amd64
chmod +x ./kind
sudo mv ./kind /usr/local/bin/kind

- name: Verify kind installation
run: kind version

# No cluster name or config file is passed to kind here.
- name: Create kind cluster
run: kind create cluster

# `make gofail-enable` runs before the test target.
# NOTE(review): presumably so the failpoints are compiled into the code under test — confirm.
- name: Running Test e2e
run: |
go mod tidy
make gofail-enable
make test-e2e-failpoint
2 changes: 2 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ linters-settings:
revive:
rules:
- name: comment-spacings
gocyclo:
min-complexity: 40
27 changes: 26 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,19 @@ test-e2e: manifests generate fmt vet ## Run the e2e tests. Expected an isolated
echo "No Kind cluster is running. Please start a Kind cluster before running the e2e tests."; \
exit 1; \
}
go test ./test/e2e/ -v -ginkgo.v
go test ./test/e2e/ -v -ginkgo.v -ginkgo.label-filter="!failpoint"

# test-e2e-failpoint runs only the e2e specs labelled "failpoint"
# (-ginkgo.label-filter="failpoint") from ./test/e2e/, after the manifests,
# generate, fmt and vet prerequisites.
# Two guards run first and abort with exit status 1 on failure:
#   1. `kind` must be resolvable via `command -v`;
#   2. the output of `kind get clusters` must contain the substring "kind".
# This target does not run `gofail enable` itself.
.PHONY: test-e2e-failpoint
test-e2e-failpoint: manifests generate fmt vet ## Run the e2e tests using gofail. Expected an isolated environment using Kind.
@command -v kind >/dev/null 2>&1 || { \
echo "Kind is not installed. Please install Kind manually."; \
exit 1; \
}
@kind get clusters | grep -q 'kind' || { \
echo "No Kind cluster is running. Please start a Kind cluster before running the e2e tests."; \
exit 1; \
}
go test ./test/e2e/ -v -ginkgo.v -ginkgo.label-filter="failpoint"

.PHONY: lint
lint: golangci-lint ## Run golangci-lint linter
Expand Down Expand Up @@ -166,6 +178,15 @@ api-docs: crd-ref-docs ## Generate api references docs.
--templates-dir=./docs/api-references/template/ \
--config=./docs/api-references/config.yaml

##@ gofail
# gofail-enable runs `gofail enable .` in the current directory, after the
# `gofail` prerequisite target.
# NOTE(review): per the gofail README this presumably rewrites `// gofail:`
# comment markers into active failpoint code in the working tree — confirm.
.PHONY: gofail-enable
gofail-enable: gofail
gofail enable .

# gofail-disable runs `gofail disable .` in the current directory, after the
# `gofail` prerequisite target.
# NOTE(review): presumably reverts what `gofail enable .` generated, restoring
# the `// gofail:` comment markers — confirm against the gofail README.
.PHONY: gofail-disable
gofail-disable: gofail
gofail disable .

##@ Dependencies

## Location to install dependencies to
Expand Down Expand Up @@ -213,6 +234,10 @@ golangci-lint: $(GOLANGCI_LINT) ## Download golangci-lint locally if necessary.
$(GOLANGCI_LINT): $(LOCALBIN)
$(call go-install-tool,$(GOLANGCI_LINT),github.com/golangci/golangci-lint/cmd/golangci-lint,$(GOLANGCI_LINT_VERSION))

# gofail installs the gofail command with `go install`. The target is phony,
# so the recipe runs every time it is requested.
# NOTE(review): no @version suffix is given, so the installed version is
# presumably the one required in go.mod — confirm.
# NOTE(review): unlike the go-install-tool based targets, this does not
# reference $(LOCALBIN); callers invoke a bare `gofail`, so Go's install
# directory presumably has to be on PATH — verify.
.PHONY: gofail
gofail:
go install go.etcd.io/gofail

# go-install-tool will 'go install' any package with custom target and name of binary, if it doesn't exist
# $1 - target path with name of binary
# $2 - package url which can be installed
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ require (
go.etcd.io/etcd/client/pkg/v3 v3.5.17
go.etcd.io/etcd/client/v3 v3.5.17
go.etcd.io/etcd/server/v3 v3.5.17
go.etcd.io/gofail v0.2.0
go.uber.org/zap v1.27.0
k8s.io/api v0.32.1
k8s.io/apimachinery v0.32.1
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ go.etcd.io/etcd/raft/v3 v3.5.17 h1:wHPW/b1oFBw/+HjDAQ9vfr17OIInejTIsmwMZpK1dNo=
go.etcd.io/etcd/raft/v3 v3.5.17/go.mod h1:uapEfOMPaJ45CqBYIraLO5+fqyIY2d57nFfxzFwy4D4=
go.etcd.io/etcd/server/v3 v3.5.17 h1:xykBwLZk9IdDsB8z8rMdCCPRvhrG+fwvARaGA0TRiyc=
go.etcd.io/etcd/server/v3 v3.5.17/go.mod h1:40sqgtGt6ZJNKm8nk8x6LexZakPu+NDl/DCgZTZ69Cc=
go.etcd.io/gofail v0.2.0 h1:p19drv16FKK345a09a1iubchlw/vmRuksmRzgBIGjcA=
go.etcd.io/gofail v0.2.0/go.mod h1:nL3ILMGfkXTekKI3clMBNazKnjUZjYLKmBHzsVAnC1o=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0 h1:9G6E0TXzGFVfTnawRzrPl83iHOAV7L8NJiR8RSGYV1g=
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.53.0/go.mod h1:azvtTADFQJA8mX80jIH/akaE7h+dbm/sVuaHqN13w74=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA=
Expand Down
40 changes: 29 additions & 11 deletions internal/controller/etcdcluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,21 +142,32 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}
targetReplica := *sts.Spec.Replicas // Start with the current size of the stateful set

// TODO: finish the logic later
// The number of replicas in the StatefulSet doesn't match the number of etcd members in the cluster.
if int(targetReplica) != memberCnt {
// TODO: finish the logic later
// nolint:staticcheck // Temporarily disable staticcheck
logger.Info("The expected number of replicas doesn't match the number of etcd members in the cluster", "targetReplica", targetReplica, "memberCnt", memberCnt)
if int(targetReplica) < memberCnt {
// a new added learner hasn't started yet

// re-generate configuration for the new learner member;
// increase statefulsets's replica by 1
// A new member has been added to the etcd cluster
// but the corresponding Pod hasn't been created yet in the StatefulSet.
// Increase the StatefulSet replicas by 1 to match the new cluster member.
newReplicaCount := targetReplica + 1
logger.Info("Increasing StatefulSet replicas to match the new etcd learner.", "oldReplicaCount", targetReplica, "newReplicaCount", newReplicaCount)
_, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, newReplicaCount, r.Scheme)
if err != nil {
return ctrl.Result{}, err
}
} else {
// an already removed member hasn't stopped yet.

// Decrease the statefulsets's replica by 1
// A member has been removed from the etcd cluster
// but the corresponding Pod is still running.
// Decrease the StatefulSet replicas by 1 to remove the unneeded Pod.
logger.Info("An etcd member was removed from the cluster, but the StatefulSet hasn't scaled down yet.")
newReplicaCount := targetReplica - 1
logger.Info("Decreasing StatefulSet replicas to remove the unneeded Pod.", "oldReplicaCount", targetReplica, "newReplicaCount", newReplicaCount)
_, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, newReplicaCount, r.Scheme)
if err != nil {
return ctrl.Result{}, err
}
}
// return
return ctrl.Result{RequeueAfter: requeueDuration}, nil
}

var (
Expand Down Expand Up @@ -214,6 +225,8 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if _, err := etcdutils.AddMember(eps, []string{peerURL}, true); err != nil {
return ctrl.Result{}, err
}
// We will interrupt this state and crash the operator before updating the StatefulSet replicas.
// gofail: var CrashAfterAddMember struct{}

logger.Info("Learner member added successfully", "peerURLs", peerURL)
} else {
Expand All @@ -228,6 +241,11 @@ func (r *EtcdClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request)
if err := etcdutils.RemoveMember(eps, memberID); err != nil {
return ctrl.Result{}, err
}

// We will interrupt this state and crash the operator before updating the StatefulSet replicas.
// gofail: var CrashAfterRemoveMember struct{}

logger.Info("Member removed successfully", "memberID", memberID)
}

sts, err = reconcileStatefulSet(ctx, logger, etcdCluster, r.Client, targetReplica, r.Scheme)
Expand Down
Loading