Skip to content

Commit

Permalink
FO 3.2.1.831
Browse files Browse the repository at this point in the history
FO 3.2.1.831
  • Loading branch information
GitTorre authored Aug 22, 2022
2 parents 8dacf88 + 0c49b16 commit 723e318
Show file tree
Hide file tree
Showing 68 changed files with 3,656 additions and 1,876 deletions.
8 changes: 4 additions & 4 deletions Build-SFPkgs.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,11 @@ function Build-SFPkg {
try {
Push-Location $scriptPath

Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.2.0.831" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.2.0.831" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.SelfContained.3.2.1.831" "$scriptPath\bin\release\FabricObserver\linux-x64\self-contained\FabricObserverType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Linux.FrameworkDependent.3.2.1.831" "$scriptPath\bin\release\FabricObserver\linux-x64\framework-dependent\FabricObserverType"

Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.2.0.831" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.2.0.831" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.2.1.831" "$scriptPath\bin\release\FabricObserver\win-x64\self-contained\FabricObserverType"
Build-SFPkg "Microsoft.ServiceFabricApps.FabricObserver.Windows.FrameworkDependent.3.2.1.831" "$scriptPath\bin\release\FabricObserver\win-x64\framework-dependent\FabricObserverType"
}
finally {
Pop-Location
Expand Down
18 changes: 9 additions & 9 deletions ClusterObserver/ClusterObserver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,14 @@ public override async Task ObserveAsync(CancellationToken token)
return;
}

await ReportAsync(token).ConfigureAwait(false);
await ReportAsync(token);

LastRunDateTime = DateTime.Now;
}

public override async Task ReportAsync(CancellationToken token)
{
await ReportClusterHealthAsync(token).ConfigureAwait(false);
await ReportClusterHealthAsync(token);
}

/// <summary>
Expand Down Expand Up @@ -146,12 +146,12 @@ private async Task ReportClusterHealthAsync(CancellationToken token)
try
{
// Monitor node status.
await MonitorNodeStatusAsync(token, ignoreDefaultQueryTimeout).ConfigureAwait(false);
await MonitorNodeStatusAsync(token, ignoreDefaultQueryTimeout);

// Check for active repairs in the cluster.
if (MonitorRepairJobStatus)
{
var repairsInProgress = await GetRepairTasksCurrentlyProcessingAsync(token).ConfigureAwait(false);
var repairsInProgress = await GetRepairTasksCurrentlyProcessingAsync(token);
string repairState = string.Empty;

if (repairsInProgress?.Count > 0)
Expand Down Expand Up @@ -262,7 +262,7 @@ await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
case HealthEvaluationKind.Nodes:
try
{
await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates, token).ConfigureAwait(false);
await ProcessNodeHealthAsync(clusterHealth.NodeHealthStates, token);
}
catch (Exception e) when (e is FabricException || e is TimeoutException)
{
Expand Down Expand Up @@ -320,7 +320,7 @@ await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(

try
{
await ProcessGenericEntityHealthAsync(evaluation, token).ConfigureAwait(false);
await ProcessGenericEntityHealthAsync(evaluation, token);
}
catch (Exception e) when (e is FabricException || e is TimeoutException)
{
Expand Down Expand Up @@ -812,7 +812,7 @@ private async Task MonitorNodeStatusAsync(CancellationToken token, bool isTest =
null,
isTest ? TimeSpan.FromSeconds(1) : ConfigurationSettings.AsyncTimeout,
token),
token).ConfigureAwait(false);
token);

// Are any of the nodes that were previously in non-Up status, now Up?
if (NodeStatusDictionary.Count > 0)
Expand Down Expand Up @@ -949,7 +949,7 @@ private async Task<bool> IsRepairManagerDeployedAsync(CancellationToken cancella
repairManagerServiceUri,
ignoreDefaultQueryTimeout ? TimeSpan.FromSeconds(1) : ConfigurationSettings.AsyncTimeout,
cancellationToken),
cancellationToken).ConfigureAwait(false);
cancellationToken);

return serviceList?.Count > 0;
}
Expand Down Expand Up @@ -986,7 +986,7 @@ private async Task<RepairTaskList> GetRepairTasksCurrentlyProcessingAsync(Cancel
null,
ignoreDefaultQueryTimeout ? TimeSpan.FromSeconds(1) : ConfigurationSettings.AsyncTimeout,
cancellationToken),
cancellationToken).ConfigureAwait(false);
cancellationToken);

return repairTasks;
}
Expand Down
7 changes: 2 additions & 5 deletions ClusterObserver/ClusterObserver.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
<RootNamespace>ClusterObserver</RootNamespace>
<AssemblyName>ClusterObserver</AssemblyName>
<TargetFramework>netcoreapp3.1</TargetFramework>
<PlatformTarget>x64</PlatformTarget>
<Copyright>Copyright © 2022</Copyright>
<Product>ClusterObserver</Product>
<Version>2.2.0.831</Version>
Expand All @@ -19,7 +18,7 @@
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<GenerateAssemblyInfo>false</GenerateAssemblyInfo>
<StartupObject>ClusterObserver.Program</StartupObject>
<Platforms>AnyCPU;x64</Platforms>
<Platforms>x64</Platforms>
</PropertyGroup>
<ItemGroup>
<Compile Remove="Utilities\ClusterIdentificationUtility.cs" />
Expand All @@ -28,14 +27,12 @@
<None Remove="ApplicationInsights.config" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.DependencyInjection" Version="5.0.2" />
<PackageReference Include="Microsoft.Extensions.DependencyInjection.Abstractions" Version="5.0.0" />
<PackageReference Include="Microsoft.ApplicationInsights" Version="2.20.0" />
<PackageReference Include="Microsoft.AspNet.WebApi.Client" Version="5.2.9" />
<PackageReference Include="Microsoft.ServiceFabric.Services" Version="5.0.516" />
<PackageReference Include="Microsoft.Win32.Registry" Version="5.0.0" />
<PackageReference Include="Newtonsoft.Json" Version="13.0.1" />
<PackageReference Include="Octokit" Version="0.51.0" />
<PackageReference Include="Octokit" Version="1.0.0" />
<PackageReference Include="McMaster.NETCore.Plugins" Version="1.4.0" />
</ItemGroup>
<ItemGroup>
Expand Down
2 changes: 1 addition & 1 deletion Documentation/Deployment/Deploy-FabricObserver.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Try {

$resourceGroup = "<YOUR-CLUSTER-RESOURCE-NAME>"
$armTemplate = "service-fabric-observer.json"
$armTemplateParameters = "service-fabric-observer.v3.2.0.831.parameters.json"
$armTemplateParameters = "service-fabric-observer.v3.2.1.831.parameters.json"

cd "<LOCAL-FO-REPO-PATH>\Documentation\Deployment"

Expand Down
6 changes: 3 additions & 3 deletions Documentation/Deployment/service-fabric-observer.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,16 +11,16 @@
},
"applicationTypeVersionFabricObserver": {
"type": "string",
"defaultValue": "3.2.0.831",
"defaultValue": "3.2.1.831",
"metadata": {
"description": "Provide the app version number of FabricObserver. This must be identical to the version, 3.2.0.831, in the referenced sfpkg specified in packageUrlFabricObserver."
"description": "Provide the app version number of FabricObserver. This must be identical to the version, 3.2.1.831, in the referenced sfpkg specified in packageUrlFabricObserver."
}
},
"packageUrlFabricObserver": {
"type": "string",
"defaultValue": "https://github.com/microsoft/service-fabric-observer/releases/download/57635490/Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.2.0.831.sfpkg",
"metadata": {
"description": "This has to be a public accessible URL for the sfpkg file which contains the FabricObserver app package. Example: https://github.com/microsoft/service-fabric-observer/releases/download/57635490/Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.2.0.831.sfpkg"
"description": "This has to be a public accessible URL for the sfpkg file which contains the FabricObserver app package. Example: https://github.com/microsoft/service-fabric-observer/releases/download/[xxxxxxxx]/Microsoft.ServiceFabricApps.FabricObserver.Windows.SelfContained.3.2.1.831.sfpkg"
}
}
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"value": "<YOUR-CLUSTER-RESOURCE-NAME>"
},
"applicationTypeVersionFabricObserver": {
"value": "3.2.0.831"
"value": "3.2.1.831"
},
"packageUrlFabricObserver": {
"value": "<PUBLIC-ACCESSIBLE-URL-FOR-FABRICOBSERVER-SFPKG>"
Expand Down
87 changes: 87 additions & 0 deletions Documentation/ETW.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
## FabricObserver ETW Support

FabricObserver employs EventSource events for ETW. There are two key pieces to this support. This is a feature typically used by internal Microsoft teams. Generally, you should disable this feature unless
you have a way to push EventSource ETW to Azure Kusto.

- FabricObserverETWProvider is the default name of the EventSource provider. You can customize this name by changing
the value of the Application parameter ObserverManagerETWProviderName. Doing so is unnecessary unless you have a more advanced scenario (like multiple instances of FO running on the same node).
- FabricObserverDataEvent is the name of each EventSource event that FabricObserver emits.

You have to enable ETW for each observer that you want to receive ETW from. You do this in ApplicationManifest.xml:

```XML
<!-- ETW - Custom EventSource Tracing -->
<Parameter Name="AppObserverEnableEtw" DefaultValue="true" />
<Parameter Name="AzureStorageUploadObserverEnableEtw" DefaultValue="false" />
<Parameter Name="CertificateObserverEnableEtw" DefaultValue="false" />
<Parameter Name="ContainerObserverEnableEtw" DefaultValue="false" />
<Parameter Name="DiskObserverEnableEtw" DefaultValue="false" />
<Parameter Name="FabricSystemObserverEnableEtw" DefaultValue="false" />
<Parameter Name="NetworkObserverEnableEtw" DefaultValue="false" />
<Parameter Name="NodeObserverEnableEtw" DefaultValue="false" />
<Parameter Name="OSObserverEnableEtw" DefaultValue="false" />
<Parameter Name="SFConfigurationObserverEnableEtw" DefaultValue="false" />
```

By default, ObserverManager's EnableETWProvider setting (also located in ApplicationManifest.xml) is enabled. If you disable this, then no ETW will be generated regardless of the Observer-specific settings you provide. Note that AppObserver is enabled to emit ETW events by default for historical reasons. As mentioned above, unless this feature is useful to you, disable it.

Let's take a look at an example of an event that is ingested into the FabricObserverDataEvent Kusto table.

``` JSON
"Message": data="{"ApplicationName":"fabric:/SomeApplication","ApplicationType":"ResourceCentralType","Code":null,"ContainerId":null,"ClusterId":"undefined","Description":null,"EntityType":2,"HealthState":0,"Metric":"Active Ephemeral Ports","NodeName":"MW2PPF7D8279821","NodeType":"AZSM","ObserverName":"AppObserver","OS":"Windows","PartitionId":"a56a62d7-69fd-4f5f-a5fb-caf8b84b537f","ProcessId":24564,"ProcessName":"SomeService","Property":null,"ProcessStartTime":"2022-08-18T15:45:27.2901800Z","ReplicaId":133053111176036935,"ReplicaRole":1,"ServiceKind":1,"ServiceName":"fabric:/SomeApplication/SomeService","ServicePackageActivationMode":0,"Source":"AppObserver","Value":133.0}"
```
As you can see, this is not correctly formatted Json. Note the data="" value.

data is a serialized instance of the TelemetryData type in this case, which holds the information that AppObserver detected for a service named fabric:/SomeApplication/SomeService for the resource metric Active Ephemeral Ports. Included in the data is everything you need to know about the service, like ReplicaId, PartitionId, NodeName, Metric, Value, ProcessName, ProcessId, ProcessStartTime, etc.

In order to parse out the Json-serialized instance of some supported FO data type from the Payload (Message, in the above example), you need to reformat the Message string into well-structured Json:

```KQL
// TelemetryData type. Json is a single object representation.
// data is the payload name. You must remove it.
FabricObserverDataEvent
| where PreciseTimeStamp >= ago(1h) and Tenant == "uswest2-test-42"
// Look for payload that is a Json object.
| where Message startswith "data=\"{"
// remove opening payload name and opening quote
| extend reData = replace_string(Message, "data=\"", "")
// remove closing quote
| extend reData = replace_string(reData, "}\"", "}")
// Now, we have Json..
| extend data = parse_json(reData)
| extend AppName = data.ApplicationName, ServiceName = data.ServiceName, Metric = data.Metric, Result = data.Value, ReplicaId = data.ReplicaId, PartitionId = data.PartitionId,
ProcessId = data.ProcessId, ProcessName = data.ProcessName, ProcessStartTime = data.ProcessStartTime, ServicePackageActivationMode = data.ServicePackageActivationMode, ReplicaRole = data.ReplicaRole,
ServiceKind = data.ServiceKind, Observer = data.ObserverName, NodeName = data.NodeName, NodeType = data.NodeType, Property = data.Property
//| where Observer == "NodeObserver"
//| where Metric == "CPU Time (Percent)" and Result > 0
| project PreciseTimeStamp, ServiceName, NodeName, NodeType, Metric, Result, ReplicaId, PartitionId, ProcessId, ProcessName, ProcessStartTime, ReplicaRole, Observer, Property
| sort by PreciseTimeStamp desc;
// ChildProcessTelemetryData/ChildProcessInfo are Json arrays.
FabricObserverDataEvent
| where PreciseTimeStamp >= ago(1h) and Tenant == "uswest2-test-42"
// Look for payload that is a Json array.
| where Message startswith "data=\"["
| extend reData = replace_string(Message, "data=\"", "")
| extend reData = replace_string(reData, "]\"", "]")
| extend data = parse_json(reData)
// data is a collection (array). Expand into rows.
| mv-expand data
| extend ServiceName = data.ServiceName
| extend Metric = data.Metric
// Parent + child processes cumulative usage value for some metric
| extend CulmulativeValue = data.Value
| extend ChildProcInfo = data.ChildProcessInfo
| project PreciseTimeStamp, ServiceName, Metric, CulmulativeValue, ChildProcInfo
| sort by PreciseTimeStamp desc
```
For information events like above (raw metrics), HealthState is always 0 (Invalid). When some metric crosses the line for a threshold you supplied, HealthState will be 2 (Warning) or 3 (Error), depending upon your related threshold configuration settings.
FO emits more than Json-serialized TelemetryData ETW events. It also emits Json-serialized ChildProcessTelemetryData events (see above), MachineTelemetryData events (OSObserver emits these), and anonymously typed events (a Json-serialized anonymous data type, which is typically something like an informational or warning event from some observer or ObserverManager that is not a custom FO data type (class) related to resource usage monitoring).

### API

For [observer plugin](Plugins.md) authors, you can use your own event name and generate your own ETW using ```LogEtw<T>(string eventName, T data)``` which is a member of ObserverBase.ObserverLogger:

```C#
ObserverLogger.LogEtw("MyEventName", myObj);
```
7 changes: 4 additions & 3 deletions Documentation/Observers.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

Observers are low-impact, long-lived objects that perform specialized monitoring and reporting activities. Observers monitor and report, but they aren't designed to take action. Observers generally monitor applications through their side effects on the node, like resource usage, but do not actually communicate with the applications. Observers report to SF Event Store (viewable through SFX) in warning and error states, and can use built-in AppInsights support to report there as well.

### Note: All of the observers that collect resource usage data can also emit telemetry: EventSource ETW and either LogAnalytics or ApplicationInsights diagnostic service calls.
### Note: All of the observers that collect resource usage data can also emit telemetry: [EventSource ETW](ETW.md) and either LogAnalytics or ApplicationInsights diagnostic service calls.

> AppInsights or LogAnalytics telemetry can be enabled in `Settings.xml` by providing your related authorization/identity information (keys).
> AppInsights or LogAnalytics telemetry can be configured in `Settings.xml` by providing your related authorization/identity information (keys). You must enable the ObserverManagerEnableTelemetryProvider app parameter in ApplicationManifest.xml, which you can also enable/disable with versionless
> parameter-only application upgrades.
### Logging

Expand Down Expand Up @@ -547,7 +548,7 @@ By default, FabricObserver runs as NetworkUser on Windows and sfappsuser on Linu
running as System or root, default FabricObserver can't monitor process behavior (this is always true on Windows). That said, there are only a few system
services you would care about: Fabric.exe and FabricGateway.exe. Fabric.exe is generally the system service that your code can directly impact with respect to machine resource usage.

**NOTE: Version 3.2.1 removes support for concurrent service process monitoring and reporting by FabricSystemObserver**. This feature is not worth the resource overhead given the limited number of processes FSO monitors.
**NOTE: Version 3.2.1.960 removes support for concurrent service process monitoring and reporting by FabricSystemObserver**. This feature is not worth the resource overhead given the limited number of processes FSO monitors.


**Input - Settings.xml**: Only ClusterOperationTimeoutSeconds is set in Settings.xml.
Expand Down
4 changes: 2 additions & 2 deletions Documentation/OperationalTelemetry.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ Here is a full example of exactly what is sent in one of these telemetry events,
"ClusterId": "00000000-1111-1111-0000-00f00d000d",
"ClusterType": "SFRP",
"NodeNameHash": "3e83569d4c6aad78083cd081215dafc81e5218556b6a46cb8dd2b183ed0095ad",
"FOVersion": "3.2.0.831",
"FOVersion": "3.2.1.831",
"HasPlugins": "False",
"ParallelCapable": "True",
"SFRuntimeVersion":"8.2.1363.9590"
"UpTime": "1.00:30:18.8058379",
"Timestamp": "2022-07-12T02:45:28.9827940Z",
"Timestamp": "2022-07-14T02:45:28.9827940Z",
"OS": "Windows",
"EnabledObserverCount": 5,
"AppObserverTotalMonitoredApps": 5,
Expand Down
Loading

0 comments on commit 723e318

Please sign in to comment.