Skip to content

Commit

Permalink
PMM-4547 Replicaset lag incorrect (#218)
Browse files Browse the repository at this point in the history
* PMM-4547 Replicaset lag incorrect

* PMM-4547 Secondary lag calc overflow

* PMM-4547 Test commented out

* PMM-4547 Test commented out

* PMM-4547 Removed unused func
  • Loading branch information
percona-csalguero authored Oct 13, 2020
1 parent 4f04265 commit 6421fe4
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 1 deletion.
125 changes: 125 additions & 0 deletions exporter/secondary_lag_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package exporter

import (
"context"
"testing"
"time"

dto "github.com/prometheus/client_model/go"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/bson/primitive"

"github.com/percona/mongodb_exporter/internal/tu"
)

// ReplicasetConfig is the decode target for the reply of the replSetGetConfig
// admin command as used by TestSecondaryLag. Only the top-level "config"
// sub-document of the reply is captured; any other reply keys are not mapped.
type ReplicasetConfig struct {
	// Config holds the replica set configuration document found under "config".
	Config RSConfig `bson:"config"`
}

// RSConfig maps a replica set configuration document to Go fields through the
// BSON keys given in the struct tags. TestSecondaryLag decodes the "config"
// part of a replSetGetConfig reply into it, edits one member and sends the
// value back as the argument of replSetReconfig.
//
// NOTE(review): Tags and GetLastErrorModes are declared as empty structs, so
// decoding into this type keeps none of the keys stored in those
// sub-documents — confirm that is acceptable before reusing this type outside
// of tests.
type RSConfig struct {
	ID string `bson:"_id"`
	Version int `bson:"version"`
	ProtocolVersion int `bson:"protocolVersion"`
	WriteConcernMajorityJournalDefault bool `bson:"writeConcernMajorityJournalDefault"`
	// Members has one entry per element of the "members" array.
	Members []struct {
		ID int `bson:"_id"`
		Host string `bson:"host"`
		ArbiterOnly bool `bson:"arbiterOnly"`
		BuildIndexes bool `bson:"buildIndexes"`
		Hidden bool `bson:"hidden"`
		Priority int `bson:"priority"`
		// Tags has no fields: the content of "tags" is not captured.
		Tags struct {
		} `bson:"tags"`
		// SlaveDelay is set by TestSecondaryLag to the number of seconds the
		// chosen member should stay behind.
		SlaveDelay int `bson:"slaveDelay"`
		Votes int `bson:"votes"`
	} `bson:"members"`
	// Settings mirrors the "settings" sub-document.
	Settings struct {
		ChainingAllowed bool `bson:"chainingAllowed"`
		HeartbeatIntervalMillis int `bson:"heartbeatIntervalMillis"`
		HeartbeatTimeoutSecs int `bson:"heartbeatTimeoutSecs"`
		ElectionTimeoutMillis int `bson:"electionTimeoutMillis"`
		CatchUpTimeoutMillis int `bson:"catchUpTimeoutMillis"`
		CatchUpTakeoverDelayMillis int `bson:"catchUpTakeoverDelayMillis"`
		// GetLastErrorModes has no fields: the content of "getLastErrorModes"
		// is not captured.
		GetLastErrorModes struct {
		} `bson:"getLastErrorModes"`
		GetLastErrorDefaults struct {
			W int `bson:"w"`
			Wtimeout int `bson:"wtimeout"`
		} `bson:"getLastErrorDefaults"`
		ReplicaSetID primitive.ObjectID `bson:"replicaSetId"`
	} `bson:"settings"`
}

// TestSecondaryLag reconfigures the replica set so that member 1 becomes a
// hidden, priority-0 member delayed by secondsBehind seconds, writes documents
// through the default test client to generate oplog entries, and then checks
// that replicationLag reports a positive lag from the diagnostic data of the
// secondary listening on tu.MongoDBS1Secondary1Port.
//
// The original replica set configuration is captured before it is modified and
// re-applied in a deferred function, so the cluster is restored even when an
// assertion aborts the test half way through.
func TestSecondaryLag(t *testing.T) {
	t.Skip("This is failing in GitHub actions. Cannot make secondary to lag behind")

	const (
		secondsBehind = 3
		sleep         = 2
		// delayedMember is the index in the config "members" array of the
		// member that gets the artificial delay.
		delayedMember = 1
	)

	ctx, cancel := context.WithTimeout(context.Background(), time.Duration((secondsBehind*2)+sleep)*time.Second)
	defer cancel()

	client := tu.DefaultTestClient(ctx, t)
	admin := client.Database("admin")

	res := admin.RunCommand(ctx, primitive.M{"replSetGetConfig": 1})
	require.NoError(t, res.Err())

	// Decode the same reply twice instead of copying the struct: a struct copy
	// would share the Members backing array, and the edits below would then
	// leak into the configuration kept for the restore.
	var rsConf, rsConfOld ReplicasetConfig
	require.NoError(t, res.Decode(&rsConfOld)) // Pristine copy, used to restore the config after the test.
	require.NoError(t, res.Decode(&rsConf))

	require.True(t, len(rsConf.Config.Members) > delayedMember,
		"replica set needs at least %d members, got %d", delayedMember+1, len(rsConf.Config.Members))

	rsConf.Config.Members[delayedMember].Priority = 0
	rsConf.Config.Members[delayedMember].Hidden = true
	rsConf.Config.Members[delayedMember].SlaveDelay = secondsBehind
	rsConf.Config.Version++

	var replSetReconfig struct {
		OK int `bson:"ok"`
	}
	err := admin.RunCommand(ctx, primitive.M{"replSetReconfig": rsConf.Config}).Decode(&replSetReconfig)
	require.NoError(t, err)

	// From here on the replica set is modified; always put the original
	// configuration back. A fresh context is used because ctx may already be
	// expired or cancelled when the deferred function runs.
	defer func() {
		restoreCtx, restoreCancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer restoreCancel()

		var restoreReply struct {
			OK int `bson:"ok"`
		}
		// The version must be higher than the one applied above.
		rsConfOld.Config.Version = rsConf.Config.Version + 1
		restoreErr := admin.RunCommand(restoreCtx, primitive.M{"replSetReconfig": rsConfOld.Config}).Decode(&restoreReply)
		assert.NoError(t, restoreErr)
	}()

	// Generate documents so oplog is forced to have operations and the lag becomes real, otherwise
	// primary and secondary oplogs are the same. Generate more than one doc to ensure oplog is updated
	// quickly for the test.
	coll := client.Database("test").Collection("testc1")
	for i := 0; i < 100; i++ {
		_, err = coll.InsertOne(ctx, bson.M{"s": 1})
		require.NoError(t, err)
		time.Sleep(20 * time.Millisecond)
	}
	err = client.Database("test").Drop(ctx)
	assert.NoError(t, err)

	msclient := tu.TestClient(ctx, tu.MongoDBS1Secondary1Port, t)

	var diag bson.M
	cmd := bson.D{{Key: "getDiagnosticData", Value: "1"}}
	err = msclient.Database("admin").RunCommand(ctx, cmd).Decode(&diag)
	require.NoError(t, err)

	data, ok := diag["data"].(bson.M)
	require.True(t, ok, "getDiagnosticData reply has no \"data\" document")

	// replicationLag returns nil when it cannot find the information it needs;
	// fail with a clear message instead of panicking on lag.Write below.
	lag := replicationLag(data)
	require.NotNil(t, lag, "replicationLag returned no metric")

	metric := &dto.Metric{}
	err = lag.Write(metric)
	require.NoError(t, err)

	// Secondary is not exactly secondsBehind behind master, so only check that
	// some lag is reported. The getters are nil-safe, unlike a direct
	// dereference of metric.Gauge.Value.
	assert.True(t, metric.GetGauge().GetValue() > 0)
}
9 changes: 8 additions & 1 deletion exporter/v1_compatibility.go
Original file line number Diff line number Diff line change
Expand Up @@ -854,6 +854,7 @@ func replicationLag(m bson.M) prometheus.Metric {
if !ok {
return nil
}

for _, member := range members {
if statestr, ok := member.(bson.M)["stateStr"].(string); ok && statestr == "PRIMARY" {
if optime, ok := member.(bson.M)["optime"].(bson.M); ok {
Expand Down Expand Up @@ -883,7 +884,13 @@ func replicationLag(m bson.M) prometheus.Metric {
return nil
}

val := float64(primaryTS.T - selfTS.T)
var val float64
if primaryTS.T > selfTS.T {
val = float64(primaryTS.T - selfTS.T)
} else {
val = float64(selfTS.T - primaryTS.T)
}

set, _ := replSetGetStatus["set"].(string)

metricName := "mongodb_mongod_replset_member_replication_lag"
Expand Down
4 changes: 4 additions & 0 deletions internal/tu/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ const (
MongosPort = "17000"
// MongoDBS1PrimaryPort MongoDB Shard 1 Primary Port.
MongoDBS1PrimaryPort = "17001"
// MongoDBS1Secondary1Port MongoDB Shard 1 Secondary 1 Port.
MongoDBS1Secondary1Port = "17002"
// MongoDBS1Secondary2Port MongoDB Shard 1 Secondary 2 Port.
MongoDBS1Secondary2Port = "17003"
// MongoDBStandAlonePort MongoDB stand alone instance Port.
MongoDBStandAlonePort = "27017"
)
Expand Down

0 comments on commit 6421fe4

Please sign in to comment.