Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Calculating and updating cluster and volume alert_count in each cluster sync #599

Merged
merged 3 commits into from
Apr 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions tendrl/gluster_integration/objects/definition/gluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,18 @@ namespace.gluster:
volume_id:
help: "Id of the volume"
type: String
relationship:
utilization:
- volume_utilization
status:
- volume_status
- volume_state
- brick_status
- quorum
- ec_min_bricks_up
- afr_quorum_state
- afr_subvol_state
- georep_status
value: /clusters/{0}/Volumes/{1}
list: /clusters/{0}/Volumes/{1}
help: "Volume Alert Counter"
Expand Down
91 changes: 49 additions & 42 deletions tendrl/gluster_integration/sds_sync/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,21 +76,6 @@ def run(self):
NS.publisher_id,
{"message": "Failed to sync cluster network details"}
)

if NS.tendrl_context.integration_id:
# Initialize alert node alert count
try:
key = 'clusters/%s/nodes/%s/alert_counters' % (
NS.tendrl_context.integration_id,
NS.node_context.node_id
)
etcd_utils.read(key)
except(etcd.EtcdException)as ex:
if type(ex) == etcd.EtcdKeyNotFound:
NS.tendrl.objects.ClusterNodeAlertCounters(
node_id=NS.node_context.node_id,
integration_id=NS.tendrl_context.integration_id
).save()
_sleep = 0
while not self._complete.is_set():
# To detect out of band deletes
Expand Down Expand Up @@ -294,7 +279,8 @@ def run(self):
"sync_interval", 10
)) + len(volumes) * 4
)

# update alert count
update_cluster_alert_count()
# check and enable volume profiling
if "provisioner/%s" % NS.tendrl_context.integration_id in \
NS.node_context.tags:
Expand All @@ -318,17 +304,6 @@ def run(self):
) in ['', 'finished', 'failed'] and \
_cluster.status in [None, ""]:
_cluster.save()
# Initialize alert count
try:
alerts_count_key = '/clusters/%s/alert_counters' % (
NS.tendrl_context.integration_id)
etcd_utils.read(alerts_count_key)
except(etcd.EtcdException)as ex:
if type(ex) == etcd.EtcdKeyNotFound:
NS.tendrl.objects.ClusterAlertCounters(
integration_id=NS.tendrl_context.integration_id
).save()

except Exception as ex:
Event(
ExceptionMessage(
Expand Down Expand Up @@ -542,21 +517,6 @@ def sync_volumes(volumes, index, vol_options, sync_ttl):
}
)
volume.save(ttl=sync_ttl)

# Initialize volume alert count
try:
volume_alert_count_key = '/clusters/%s/Volumes/%s/'\
'alert_counters' % (
NS.tendrl_context.integration_id,
volumes['volume%s.id' % index]
)
etcd_utils.read(volume_alert_count_key)
except(etcd.EtcdException)as ex:
if type(ex) == etcd.EtcdKeyNotFound:
NS.gluster.objects.VolumeAlertCounters(
integration_id=NS.tendrl_context.integration_id,
volume_id=volumes['volume%s.id' % index]
).save()
# Save the default values of volume options
vol_opt_dict = {}
for opt_count in \
Expand Down Expand Up @@ -861,3 +821,50 @@ def brick_status_alert(hostname):
finally:
if isinstance(lock, etcd.lock.Lock) and lock.is_acquired:
lock.release()


def update_cluster_alert_count():
    """Recompute and persist the cluster and per-volume alert counters.

    Loads every ClusterAlert tagged with this cluster's integration_id and
    counts those whose severity is WARNING or CRITICAL; the total is saved
    as ClusterAlertCounters. A counted alert whose resource is listed under
    its alert type in the VolumeAlertCounters 'relationship' definition is
    also added to the matching volume, looked up by the alert's
    'volume_name' tag. One VolumeAlertCounters is saved for every volume
    returned by get_volume_alert_counts(), including volumes with no alerts.

    etcd errors and AttributeErrors are logged at debug level and swallowed,
    so a failed update does not interrupt the caller.
    """
    cluster_alert_count = 0
    counted_severities = ("WARNING", "CRITICAL")
    try:
        alert_counts = get_volume_alert_counts()
        alerts = NS.tendrl.objects.ClusterAlert(
            tags={'integration_id': NS.tendrl_context.integration_id}
        ).load_all()
        for alert in alerts:
            # Tags arrive as a JSON string; decode them in place.
            alert.tags = json.loads(alert.tags)
            if alert.severity not in counted_severities:
                continue
            cluster_alert_count += 1
            try:
                volume_resources = NS.gluster.objects.VolumeAlertCounters(
                )._defs['relationship'][alert.alert_type.lower()]
            except KeyError:
                # No volume relationship is defined for this alert type, so
                # the alert only contributes to the cluster-wide total
                # instead of aborting the whole counter update.
                continue
            if alert.resource not in volume_resources:
                continue
            vol_name = alert.tags.get('volume_name', None)
            if vol_name and vol_name in alert_counts:
                alert_counts[vol_name]['alert_count'] += 1
        # Update cluster alert count
        NS.tendrl.objects.ClusterAlertCounters(
            integration_id=NS.tendrl_context.integration_id,
            alert_count=cluster_alert_count
        ).save()
        # Update volume alert count; only the per-volume record is needed,
        # and values() works on both Python 2 and Python 3.
        for vol_dict in alert_counts.values():
            NS.gluster.objects.VolumeAlertCounters(
                integration_id=NS.tendrl_context.integration_id,
                alert_count=vol_dict['alert_count'],
                volume_id=vol_dict['vol_id']
            ).save()
    except (etcd.EtcdException, AttributeError) as ex:
        logger.log(
            "debug",
            NS.publisher_id,
            {"message": "Unable to update alert count.err: %s" % ex}
        )


def get_volume_alert_counts():
    """Build a zeroed alert-count table covering every loaded volume.

    Returns:
        dict keyed by volume name; each value is
        ``{'vol_id': <the volume's vol_id>, 'alert_count': 0}``.
    """
    return {
        vol.name: {'vol_id': vol.vol_id, 'alert_count': 0}
        for vol in NS.gluster.objects.Volume().load_all()
    }