diff --git a/CODE-OF-CONDUCT.md b/CODE-OF-CONDUCT.md index ed820eeb3..c622543c8 100644 --- a/CODE-OF-CONDUCT.md +++ b/CODE-OF-CONDUCT.md @@ -1,3 +1,3 @@ -## The NRI Plugin Collection Project Community Code of Conduct +# The NRI Plugins Project Community Code of Conduct -The NRI Plugin Collection Project follows the [Containers Community Code of Conduct](https://github.com/containers/common/blob/main/CODE-OF-CONDUCT.md). +The NRI Plugins Project follows the [Containers Community Code of Conduct](https://github.com/containers/common/blob/main/CODE-OF-CONDUCT.md). diff --git a/Makefile b/Makefile index ec1879ba5..9dee84432 100644 --- a/Makefile +++ b/Makefile @@ -91,6 +91,17 @@ LDFLAGS = \ -X=github.com/containers/nri-plugins/pkg/version.Build=$(BUILD_BUILDID) \ -B 0x$(RANDOM_ID)" +# Documentation-related variables +SPHINXOPTS ?= -W +SPHINXBUILD = sphinx-build +SITE_BUILDDIR ?= build/docs + +# Docker base command for working with html documentation. +DOCKER_SITE_BUILDER_IMAGE := nri-plugins-site-builder +DOCKER_SITE_CMD := $(DOCKER) run --rm -v "`pwd`:/docs" --user=`id -u`:`id -g` \ + -p 8081:8081 \ + -e SITE_BUILDDIR=$(SITE_BUILDDIR) -e SPHINXOPTS=$(SPHINXOPTS) + # # top-level targets # @@ -329,3 +340,33 @@ report-licenses: --ignore github.com/containers/nri-plugins \ > $(LICENSE_PATH)/licenses.csv && \ echo See $(LICENSE_PATH)/licenses.csv for license information + +# +# Rules for documentation +# + +html: clean-html + $(Q)BUILD_VERSION=$(BUILD_VERSION) \ + $(SPHINXBUILD) -c docs . "$(SITE_BUILDDIR)" $(SPHINXOPTS) + cp docs/index.html "$(SITE_BUILDDIR)" + for d in $$(find docs -name figures -type d); do \ + mkdir -p $(SITE_BUILDDIR)/$$d && cp $$d/* $(SITE_BUILDDIR)/$$d; \ + done + +serve-html: html + $(Q)cd $(SITE_BUILDDIR) && python3 -m http.server 8081 + +clean-html: + rm -rf $(SITE_BUILDDIR) + +site-build: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp + $(Q)$(DOCKER_SITE_CMD) $(DOCKER_SITE_BUILDER_IMAGE) make html + +site-serve: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp + $(Q)$(DOCKER_SITE_CMD) -it $(DOCKER_SITE_BUILDER_IMAGE) make serve-html + +.$(DOCKER_SITE_BUILDER_IMAGE).image.stamp: docs/Dockerfile docs/requirements.txt + docker build -t $(DOCKER_SITE_BUILDER_IMAGE) docs + touch $@ + +docs: site-build diff --git a/README.md b/README.md index 2a048c0a3..ef6c74454 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# NRI Plugin Collection +# NRI Plugins This repository contains a collection of community maintained NRI plugins. @@ -9,4 +9,4 @@ Currently following plugins are available: | [Topology Aware][1] | resource policy | | [Balloons][1] | resource policy | -[1]: http://github.com/containers/nri-plugins/blob/main/docs/README-resource-policy.md +[1]: http://github.com/containers/nri-plugins/blob/main/docs/resource-policy/README.md diff --git a/SECURITY.md b/SECURITY.md index 6d7c62b19..8b709c43c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,4 +1,4 @@ -# Security and Disclosure Information Policy for the NRI Plugin Collection Project +# Security and Disclosure Information Policy for the NRI Plugins Project * [Reporting a Vulnerability](#Reporting-a-Vulnerability) * [Security Announcements](#Security-Announcements) @@ -6,7 +6,7 @@ ## Reporting a Vulnerability -If you think you've identified a security issue in a NRI Plugin Collection project, +If you think you've identified a security issue in a NRI Plugins project, please DO NOT report the issue publicly via the Github issue tracker, mailing list, or IRC. 
Instead, send an email with as many details as possible to [cncf-crio-security@lists.cncf.io](mailto:cncf-crio-security@lists.cncf.io?subject=Security%20Vunerablity%20Report) or [security@containerd.io](mailto:security@containerd.io?subject=Security%20Vunerablity%20Report). diff --git a/docs/Dockerfile b/docs/Dockerfile new file mode 100644 index 000000000..bc702fdcd --- /dev/null +++ b/docs/Dockerfile @@ -0,0 +1,13 @@ +FROM sphinxdoc/sphinx:5.3.0 + +RUN apt-get update && apt-get install -y wget git + +# Note: Any golang version that can 'go list -m -f {{.Variable}}' is fine... +RUN wget https://go.dev/dl/go1.20.4.linux-amd64.tar.gz && \ + tar -C /usr/local -xzf go1.20.4.linux-amd64.tar.gz + +ENV PATH=$PATH:/usr/local/go/bin + +COPY requirements.txt . + +RUN pip3 install -r requirements.txt diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html new file mode 100644 index 000000000..0e68b46ac --- /dev/null +++ b/docs/_templates/layout.html @@ -0,0 +1,35 @@ +{%- extends "!layout.html" %} + +{% block footer %} + {% if versions_menu %} +
+ + GitHub Pages + {{ versions_menu_this_version }} + + +
+
+
{{ _('Versions') }}
+
+
+
+ all releases +
+
+
+
+ {% endif %} + + +{% endblock %} diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 000000000..3ba21d146 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,282 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +from docutils import nodes +from os.path import isdir, isfile, join, basename, dirname +from os import makedirs, getenv +from shutil import copyfile +from subprocess import run, STDOUT + +# -- Project information ----------------------------------------------------- + +project = 'NRI Plugins' +copyright = '2023, various' +author = 'various' + +master_doc = 'docs/index' + + +############################################################################## +# +# This section determines the behavior of links to local items in .md files. +# +# if useGitHubURL == True: +# +# links to local files and directories will be turned into github URLs +# using either the baseBranch defined here or using the commit SHA. +# +# if useGitHubURL == False: +# +# local files will be moved to the website directory structure when built +# local directories will still be links to github URLs +# +# if built with GitHub workflows: +# +# the GitHub URLs will use the commit SHA (GITHUB_SHA environment variable +# is defined by GitHub workflows) to link to the specific commit. +# +############################################################################## + +baseBranch = "main" +useGitHubURL = True +commitSHA = getenv('GITHUB_SHA') +githubServerURL = getenv('GITHUB_SERVER_URL') +githubRepository = getenv('GITHUB_REPOSITORY') +if githubServerURL and githubRepository: + githubBaseURL = join(githubServerURL, githubRepository) +else: + githubBaseURL = "https://github.com/containers/nri-plugins/" + +githubFileURL = join(githubBaseURL, "blob/") +githubDirURL = join(githubBaseURL, "tree/") +if commitSHA: + githubFileURL = join(githubFileURL, commitSHA) + githubDirURL = join(githubDirURL, commitSHA) +else: + githubFileURL = join(githubFileURL, baseBranch) + githubDirURL = join(githubDirURL, baseBranch) + +# Version displayed in the upper left corner of the site +ref = getenv('GITHUB_REF', default="") +if ref == "refs/heads/main": + version = "devel" +elif ref.startswith("refs/heads/release-"): + # For release branches just show the latest tag name + buildVersion = getenv("BUILD_VERSION", default="unknown") + version = buildVersion.split('-')[0] +elif ref.startswith("refs/tags/"): + version = ref[len("refs/tags/"):] +else: + version = getenv("BUILD_VERSION", default="unknown") + +release = getenv("BUILD_VERSION", default="unknown") + +# Versions to show in the version menu +if getenv('VERSIONS_MENU'): + html_context = { + 'versions_menu': True, + 'versions_menu_this_version': getenv('VERSIONS_MENU_THIS_VERSION', version)} + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. 
They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ['myst_parser', 'sphinx_markdown_tables'] +myst_enable_extensions = ['substitution'] +source_suffix = {'.rst': 'restructuredtext','.md': 'markdown'} + +# Substitution variables +def module_version(module, version): + version=version.split('-', 1)[0] + if module == 'github.com/intel/goresctrl': + version = '.'.join(version.split('.')[0:2]) + '.0' + return version + +def gomod_versions(modules): + versions = {} + gocmd = run(['go', 'list', '-m', '-f', '{{.GoVersion}}'], + check=True, capture_output=True, universal_newlines=True) + versions['golang'] = gocmd.stdout.strip() + for m in modules: + gocmd = run(['go', 'list', '-m', '-f', '{{.Version}}', '%s' % m], + check=True, capture_output=True, universal_newlines=True) + versions[m] = module_version(m, gocmd.stdout.strip()) + return versions + +mod_versions = gomod_versions(['github.com/intel/goresctrl']) +myst_substitutions = { + 'golang_version': mod_versions['golang'], + 'goresctrl_version': mod_versions['github.com/intel/goresctrl'] +} +myst_heading_anchors = 3 + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', '.github', '_work', 'generate', 'README.md', 'TODO.md', 'SECURITY.md', 'CODE-OF-CONDUCT.md', 'docs/releases', 'test/self-hosted-runner/README.md', 'test/e2e/README.md', 'docs/resource-policy/releases', 'docs/resource-policy/README.md','test/statistics-analysis/README.md'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_theme_options = { + 'display_version': True, +} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +#html_static_path = ['_static'] + +def setup(app): + app.connect('doctree-resolved',fixLocalMDAnchors) + app.connect('missing-reference',fixRSTLinkInMD) + +############################################################################### +# +# This section defines callbacks that make markdown specific tweaks to +# either: +# +# 1. Fix something that recommonmark does wrong. +# 2. Provide support for .md files that are written as READMEs in a GitHub +# repo. +# +# Only use these changes if using the extension ``recommonmark``. +# +############################################################################### + +def isHTTPLink(uri): + return uri.startswith('http://') or uri.startswith('https://') + +def isMDFileLink(uri): + return uri.endswith('.md') or '.md#' in uri + +def isRSTFileLink(uri): + return uri.endswith('.rst') + +# Callback registerd with 'missing-reference'. +def fixRSTLinkInMD(app, env, node, contnode): + refTarget = node.get('reftarget') + + if isHTTPLink(refTarget): + return + + if isRSTFileLink(refTarget) and not isHTTPLink(refTarget): + # This occurs when a .rst file is referenced from a .md file + # Currently unable to check if file exists as no file + # context is provided and links are relative. 
+ # + # Example: [Application examples](examples/readme.rst) + # + contnode['refuri'] = contnode['refuri'].replace('.rst','.html') + contnode['internal'] = "True" + return contnode + elif refTarget.startswith("/"): + # This occurs when a file is referenced for download from an .md file. + # Construct a list of them and short-circuit the warning. The files + # are moved later (need file location context). To avoid warnings, + # write .md files, make the links absolute. This only marks them fixed + # if it can verify that they exist. + # + # Example: [Makefile](/Makefile) + # + filePath = refTarget.lstrip("/") + if isfile(filePath) or isdir(filePath): + return contnode + + +def normalizePath(docPath,uriPath): + if uriPath == "": + return uriPath + if "#" in uriPath: + # Strip out anchors + uriPath = uriPath.split("#")[0] + if uriPath.startswith("/"): + # It's an absolute path + return uriPath.lstrip("/") #path to file from project directory + else: + # It's a relative path + docDir = dirname(docPath) + return join(docDir,uriPath) #path to file from referencing file + + +# Callback registerd with 'doctree-resolved'. +def fixLocalMDAnchors(app, doctree, docname): + for node in doctree.traverse(nodes.reference): + uri = node.get('refuri') + + if isHTTPLink(uri): + continue + + filePath = normalizePath(docname,uri) + + if isfile(filePath): + # Only do this if the file exists. + # + # TODO: Pop a warning if the file doesn't exist. + # + if isMDFileLink(uri) and not isHTTPLink(uri): + # Make sure .md file links that weren't caught are converted. + # These occur when creating an explicit link to an .md file + # from an .rst file. By default these are not validated by Sphinx + # or recommonmark. Only toctree references are validated. recommonmark + # also fails to convert links to local Markdown files that include + # anchors. This fixes that as well. + # + # Only include this code if .md files are being converted to html + # + # Example: `Google Cloud Engine `__ + # [configuration options](autotest.md#configuration-options) + # + node['refuri'] = node['refuri'].replace('.md','.html') + else: + # Handle the case where markdown is referencing local files in the repo + # + # Example: [Makefile](/Makefile) + # + if useGitHubURL: + # Replace references to local files with links to the GitHub repo + # + newURI = join(githubFileURL, filePath) + print("new url: ", newURI) + node['refuri']=newURI + else: + # If there are links to local files other than .md (.rst files are caught + # when warnings are fired), move the files into the Sphinx project, so + # they can be accessed. + newFileDir = join(app.outdir,dirname(filePath)) # where to move the file in Sphinx output. + newFilePath = join(app.outdir,filePath) + newURI = uri # if the path is relative no need to change it. + if uri.startswith("/"): + # It's an absolute path. Need to make it relative. + uri = uri.lstrip("/") + docDirDepth = len(docname.split("/")) - 1 + newURI = "../"*docDirDepth + uri + if not isdir(newFileDir): + makedirs(newFileDir) + copyfile(filePath,newFilePath) + node['refuri'] = newURI + elif "#" not in uri: # ignore anchors + # turn links to directories into links to the repo + if isdir(filePath): + newURI = join(githubDirURL, filePath) + node['refuri']=newURI diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 000000000..708eb515f --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,10 @@ +# Contributing + +Please use the GitHub\* infrastructure for contributing to NRI Plugins. 
+Use [pull requests](https://github.com/containers/nri-plugins/pulls) +to contribute code, bug fixes, or if you want to discuss your ideas in terms of +code. Open [issues](https://github.com/containers/nri-plugins/issues) to +report bugs, request new features, or if you want to discuss any other topics +related to NRI plugins. + +For the actual NRI (Node Resource Interface) API, please see [NRI repository](https://github.com/containerd/nri) diff --git a/docs/index.html b/docs/index.html new file mode 100644 index 000000000..50d6988fa --- /dev/null +++ b/docs/index.html @@ -0,0 +1 @@ + diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 000000000..4fcefd194 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,14 @@ +.. NRI Plugins documentation master file + +Welcome to NRI Plugins documentation +==================================== + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + resource-policy/index.rst + + contributing.md + + Project GitHub repository diff --git a/docs/releases/conf.py b/docs/releases/conf.py new file mode 100644 index 000000000..ee64ee7ad --- /dev/null +++ b/docs/releases/conf.py @@ -0,0 +1,76 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'NRI Plugins' +copyright = '2023, various' +author = 'various' + +# Versions to show in the version menu +version = "all releases" +if os.getenv('VERSIONS_MENU'): + html_context = { + 'versions_menu': True, + 'versions_menu_this_version': version} + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'myst_parser', + 'sphinx_markdown_tables' + ] +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown' + } + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['../_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_theme_options = { + 'display_version': True, +} + + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+#html_static_path = ['_static'] + +# Callbacks for recommonmark +def setup(app): + app.connect('missing-reference',ignoreMissingRefs) + +def ignoreMissingRefs(app, env, node, contnode): + return contnode diff --git a/docs/releases/index.md b/docs/releases/index.md new file mode 100644 index 000000000..edf183c77 --- /dev/null +++ b/docs/releases/index.md @@ -0,0 +1,21 @@ +# Releases + +For up-to-date user documentation see the [documentation site](/nri-plugins/resource-policy) + +## Documentation for Released Versions +
+
+ + diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 000000000..329afe4b4 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,5 @@ +sphinx==5.3.0 +sphinx_rtd_theme +myst-parser==0.18.1 +sphinx-markdown-tables +Pygments==2.13.0 diff --git a/docs/README-resource-policy.md b/docs/resource-policy/README.md similarity index 100% rename from docs/README-resource-policy.md rename to docs/resource-policy/README.md diff --git a/docs/resource-policy/developers-guide/architecture.md b/docs/resource-policy/developers-guide/architecture.md new file mode 100644 index 000000000..130df98e5 --- /dev/null +++ b/docs/resource-policy/developers-guide/architecture.md @@ -0,0 +1,205 @@ +# Architecture + +## Overview + +NRI Resource Policy (later NRI-RP) plugin is an add-on for controlling +container resource allocation on Kubernetes nodes. + +NRI-RP plugs in to the NRI interface provided by container runtime implementation. +The NRI-RP may alter the container resource allocation depending on +configuration. + +NRI-RP keeps track of the states of all containers running on a Kubernetes +node. Whenever it receives a NRI request that results in changes to the +resource allocation of any container (container creation, deletion, or +resource assignment update request), NRI-RP runs the built-in policy +algorithm. This policy makes a decision about how the assignment of +resources should be updated. The policy can make changes to any +container in the system, not just the one associated with the received +NRI request. NRI-RP's internal state tracking cache provides an abstraction +for modifying containers and the policy uses this abstraction for recording its +decisions. + +Many aspects for NRI-RP are configurable. These include, for instance, +configuration of the resource assignment algorithm for the policy. + +Although NRI-RP can be configured using a static configuration file, +the preferred way to configure all NRI-RP instances in a cluster is to use +Kubernetes CRDs and ConfigMaps. + +

+![NRI Resource Policy Architecture](figures/nri-resource-policy.png)

+ +## Components + +### [Node Agent](/pkg/resmgr/agent/) + +The node agent is a component internal to NRI-RP itself. All interactions +by NRI-RP with the Kubernetes Control Plane go through the node agent with +the node agent performing any direct interactions on behalf of NRI-RP. + +The agent interface implements the following functionality: + - push updated external configuration data to NRI-RP + - updating resource capacity of the node + - getting, setting, or removing labels on the node + - getting, setting, or removing annotations on the node + - getting, setting, or removing taints on the node + +The config interface is defined and has its gRPC server running in +NRI-RP. The agent acts as a gRPC client for this interface. The low-level +cluster interface is defined and has its gRPC server running in the agent, +with the [convenience layer](/pkg/resmgr/agent) defined in NRI-RP. +NRI-RP acts as a gRPC client for the low-level plumbing interface. + +Additionally, the stock node agent that comes with NRI-RP implements schemes +for: + - configuration management for all NRI-RP instances + - management of dynamic adjustments to container resource assignments + + +### [Resource Manager](/pkg/resmgr/) + +NRI-RP implements an event processing pipeline. In addition to NRI events, +it processes a set of other events that are not directly related to or the +result of NRI requests. +These events are typically internally generated within NRI-RP. They can be +the result of changes in the state of some containers or the utilization +of a shared system resource, which potentially could warrant an attempt to +rebalance the distribution of resources among containers to bring the system +closer to an optimal state. Some events can also be generated by policies. + +The Resource Manager component of NRI-RP implements the basic control +flow of the processing pipeline. It passes control to all the +necessary sub-components of NRI-RP at the various phases of processing a +request or an event. Additionally, it serializes the processing of these, +making sure there is at most one request or event being processed at any +point in time. + +The high-level control flow of the request processing pipeline is as +follows: + +A. If the request does not need policying, let it bypass the processing +pipeline; hand it off for logging, then relay it to the server and the +corresponding response back to the client. + +B. If the request needs to be intercepted for policying, do the following: + 1. Lock the processing pipeline serialization lock. + 2. Look up/create cache objects (pod/container) for the request. + 3. If the request has no resource allocation consequences, do proxying + (step 6). + 4. Otherwise, invoke the policy layer for resource allocation: + - Pass it on to the configured active policy, which will + - Allocate resources for the container. + - Update the assignments for the container in the cache. + - Update any other containers affected by the allocation in the cache. + 5. Invoke the controller layer for post-policy processing, which will: + - Collect controllers with pending changes in their domain of control + - for each invoke the post-policy processing function corresponding to + the request. + - Clear pending markers for the controllers. + 6. Proxy the request: + - Relay the request to the server. + - Send update requests for any additional affected containers. + - Update the cache if/as necessary based on the response. + - Relay the response back to the client. + 7. 
Release the processing pipeline serialization lock. + +The high-level control flow of the event processing pipeline is one of the +following, based on the event type: + + - For policy-specific events: + 1. Engage the processing pipeline lock. + 2. Call policy event handler. + 3. Invoke the controller layer for post-policy processing (same as step 5 for requests). + 4. Release the pipeline lock. + - For metrics events: + 1. Perform collection/processing/correlation. + 2. Engage the processing pipeline lock. + 3. Update cache objects as/if necessary. + 4. Request rebalancing as/if necessary. + 5. Release pipeline lock. + - For rebalance events: + 1. Engage the processing pipeline lock. + 2. Invoke policy layer for rebalancing. + 3. Invoke the controller layer for post-policy processing (same as step 5 for requests). + 4. Release the pipeline lock. + + +### [Cache](/pkg/resmgr/cache/) + +The cache is a shared internal storage location within NRI-RP. It tracks the +runtime state of pods and containers known to NRI-RP, as well as the state +of NRI-RP itself, including the active configuration and the state of the +active policy. The cache is saved to permanent storage in the filesystem and +is used to restore the runtime state of NRI-RP across restarts. + +The cache provides functions for querying and updating the state of pods and +containers. This is the mechanism used by the active policy to make resource +assignment decisions. The policy simply updates the state of the affected +containers in the cache according to the decisions. + +The cache's ability to associate and track changes to containers with +resource domains is used to enforce policy decisions. The generic controller +layer first queries which containers have pending changes, then invokes each +controller for each container. The controllers use the querying functions +provided by the cache to decide if anything in their resource/control domain +needs to be changed and then act accordingly. + +Access to the cache needs to be serialized. However, this serialization is +not provided by the cache itself. Instead, it assumes callers to make sure +proper protection is in place against concurrent read-write access. The +request and event processing pipelines in the resource manager use a lock to +serialize request and event processing and consequently access to the cache. + +If a policy needs to do processing unsolicited by the resource manager, IOW +processing other than handling the internal policy backend API calls from the +resource manager, then it should inject a policy event into the resource +managers event loop. This causes a callback from the resource manager to +the policy's event handler with the injected event as an argument and with +the cache properly locked. + + +### [Generic Policy Layer](/pkg/resmgr/policy/policy.go) + +The generic policy layer defines the abstract interface the rest of NRI-RP +uses to interact with policy implementations and takes care of the details +of activating and dispatching calls through to the configured active policy. + + +### [Generic Resource Controller Layer](/pkg/resmgr/control/control.go) + +The generic resource controller layer defines the abstract interface the rest +of NRI-RP uses to interact with resource controller implementations and takes +care of the details of dispatching calls to the controller implementations +for post-policy enforcment of decisions. 
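
The control flow above maps onto a fairly small amount of Go. The following is a conceptual sketch only, using hypothetical type and function names rather than the actual APIs under `/pkg/resmgr`, `/pkg/resmgr/cache`, `/pkg/resmgr/policy` and `/pkg/resmgr/control`. It illustrates the pipeline serialization lock, the cache lookup, the policy call that records pending decisions in the cache, and the controller pass that enforces and clears them:

```go
// Conceptual sketch of the serialized request-processing pipeline described
// above. All names are hypothetical; the real code lives under /pkg/resmgr.
package main

import (
	"fmt"
	"sync"
)

// Container is a stand-in for the cache's view of one container.
type Container struct {
	ID      string
	CPUs    string // cpuset decided by the policy, e.g. "0-3"
	Pending bool   // decision recorded in the cache but not yet enforced
}

// Cache is a minimal stand-in for the shared state-tracking cache.
type Cache struct{ containers map[string]*Container }

func (c *Cache) LookupOrCreate(id string) *Container {
	if ctr, ok := c.containers[id]; ok {
		return ctr
	}
	ctr := &Container{ID: id}
	c.containers[id] = ctr
	return ctr
}

// PendingContainers returns containers with unenforced policy decisions.
func (c *Cache) PendingContainers() []*Container {
	var out []*Container
	for _, ctr := range c.containers {
		if ctr.Pending {
			out = append(out, ctr)
		}
	}
	return out
}

// Policy records its decisions in the cache; a Controller enforces them.
type Policy func(*Cache, *Container) error

type Controller func(*Container) error

// ResourceManager serializes request and event processing with one lock.
type ResourceManager struct {
	mu          sync.Mutex
	cache       *Cache
	policy      Policy
	controllers []Controller
}

// CreateContainer mirrors steps B.1-B.7 of the request pipeline above.
func (rm *ResourceManager) CreateContainer(id string) error {
	rm.mu.Lock()         // 1. take the pipeline serialization lock
	defer rm.mu.Unlock() // 7. release it once processing is done

	ctr := rm.cache.LookupOrCreate(id) // 2. cache objects for the request

	if err := rm.policy(rm.cache, ctr); err != nil { // 4. policy layer
		return err
	}
	for _, pending := range rm.cache.PendingContainers() { // 5. controller layer
		for _, enforce := range rm.controllers {
			if err := enforce(pending); err != nil {
				return err
			}
		}
		pending.Pending = false // clear the pending marker
	}
	// 6. relaying the NRI request/response to the runtime is omitted here.
	return nil
}

func main() {
	rm := &ResourceManager{
		cache:  &Cache{containers: map[string]*Container{}},
		policy: func(c *Cache, ctr *Container) error { ctr.CPUs, ctr.Pending = "0-3", true; return nil },
		controllers: []Controller{
			func(ctr *Container) error { fmt.Printf("pin %s to cpuset %s\n", ctr.ID, ctr.CPUs); return nil },
		},
	}
	_ = rm.CreateContainer("pod0:ctr0")
}
```

Real requests are of course relayed to and from the runtime over NRI, and the policy may mark any number of additional containers as pending; the sketch only shows the ordering of the steps.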
+ + +### [Metrics Collector](/pkg/metrics/) + +The metrics collector gathers a set of runtime metrics about the containers +running on the node. NRI-RP can be configured to periodically evaluate this +collected data to determine how optimal the current assignment of container +resources is and to attempt a rebalancing/reallocation if it is deemed +both possible and necessary. + + +### [Policy Implementations](/cmd/) + +#### [Topology Aware](/cmd/topology-aware/) + +A topology-aware policy capable of handling multiple tiers/types of memory, +typically a DRAM/PMEM combination configured in 2-layer memory mode. + +#### [Balloons](/cmd/balloons/) + +A balloons policy allows user to define fine grained control how the +computer resources are distributed to workloads. + +#### [Template](/cmd/template/) + +The template policy can be used as a base for developing new policies. +It provides hooks that the policy developer can fill to define fine grained +control how the computer resources are distributed to workloads. +Do not edit the template policy directly but copy it to new name and edit that. diff --git a/docs/resource-policy/developers-guide/e2e-test.md b/docs/resource-policy/developers-guide/e2e-test.md new file mode 100644 index 000000000..06adb92f5 --- /dev/null +++ b/docs/resource-policy/developers-guide/e2e-test.md @@ -0,0 +1,118 @@ +# End-to-End tests + +## Prerequisites + +Install: +- `docker` +- `vagrant` + +## Usage + +Run policy tests: + +``` +cd test/e2e +[VAR=VALUE...] ./run_tests.sh policies.test-suite +``` + +Run tests only on certain policy, topology, or only selected test: + +``` +cd test/e2e +[VAR=VALUE...] ./run_tests.sh policies.test-suite[/POLICY[/TOPOLOGY[/testNN-*]]] +``` + +Get help on available `VAR=VALUE`'s with `./run.sh help`. +`run_tests.sh` calls `run.sh` in order to execute selected tests. +Therefore the same `VAR=VALUE` definitions apply both scripts. + +## Test phases + +In the *setup phase* `run.sh` creates a virtual machine unless it +already exists. When it is running, tests create a single-node cluster +and deploy `nri-resource-policy` DaemonSet on it. + +In the *test phase* `run.sh` runs a test script. *Test scripts* are +`bash` scripts that can use helper functions for running commands and +observing the status of the virtual machine and software running on it. + +In the *tear down phase* `run.sh` copies logs from the virtual machine +and finally stops or deletes the virtual machine, if that is wanted. + +## Test modes + +- `test` mode runs fast and reports `Test verdict: PASS` or + `FAIL`. The exit status is zero if and only if a test passed. + +Currently only the normal test mode is supported. + +## Running from scratch and quick rerun in existing virtual machine + +The test will use `vagrant`-managed virtual machine named in the +`vm_name` environment variable. The default name is constructed +from used topology, Linux distribution and runtime name. +If a virtual machine already exists, the test will be run on it. +Otherwise the test will create a virtual machine from scratch. +You can delete a virtual machine by going to the VM directory and +giving the command `make destroy`. + +## Custom topologies + +If you change NUMA node topology of an existing virtual machine, you +must delete the virtual machine first. Otherwise the `topology` variable +is ignored and the test will run in the existing NUMA +configuration. + +The `topology` variable is a JSON array of objects. Each object +defines one or more NUMA nodes. 
Keys in objects: +``` +"mem" mem (RAM) size on each NUMA node in this group. + The default is "0G". +"nvmem" nvmem (non-volatile RAM) size on each NUMA node + in this group. The default is "0G". +"cores" number of CPU cores on each NUMA node in this group. + The default is 0. +"threads" number of threads on each CPU core. + The default is 2. +"nodes" number of NUMA nodes on each die. + The default is 1. +"dies" number of dies on each package. + The default is 1. +"packages" number of packages. + The default is 1. +``` + + +Example: + +Run the test in a VM with two NUMA nodes. There are 4 CPUs (two cores, two +threads per core by default) and 4G RAM in each node +``` +e2e$ vm_name=my2x4 topology='[{"mem":"4G","cores":2,"nodes":2}]' ./run.sh +``` + +Run the test in a VM with 32 CPUs in total: there are two packages +(sockets) in the system, each containing two dies. Each die containing +two NUMA nodes, each node containing 2 CPU cores, each core containing +two threads. And with a NUMA node with 16G of non-volatile memory +(NVRAM) but no CPUs. + +``` +e2e$ vm_name=mynvram topology='[{"mem":"4G","cores":2,"nodes":2,"dies":2,"packages":2},{"nvmem":"16G"}]' ./run.sh +``` + +## Test output + +All test output is saved under the directory in the environment +variable `outdir` if the `run.sh` script is executed as is. The default +output directory in this case is `./output`. + +For the standard e2e-tests run by `run_tests.sh`, the output directory +is constructed from used Linux distribution, container runtime name and +the used machine topology. +For example `n4c16-generic-fedora37-containerd` output directory would +indicate four node and 16 CPU system, running with Fedora 37 and having +containerd as a container runtime. + +Executed commands with their output, exit status and timestamps are +saved under the `output/commands` directory. diff --git a/docs/resource-policy/developers-guide/figures/nri-resource-policy.png b/docs/resource-policy/developers-guide/figures/nri-resource-policy.png new file mode 100644 index 000000000..fdb385c4e Binary files /dev/null and b/docs/resource-policy/developers-guide/figures/nri-resource-policy.png differ diff --git a/docs/resource-policy/developers-guide/index.rst b/docs/resource-policy/developers-guide/index.rst new file mode 100644 index 000000000..dc83815f6 --- /dev/null +++ b/docs/resource-policy/developers-guide/index.rst @@ -0,0 +1,7 @@ +Developer's Guide +################# +.. toctree:: + :maxdepth: 1 + + architecture.md + testing.rst diff --git a/docs/resource-policy/developers-guide/testing.rst b/docs/resource-policy/developers-guide/testing.rst new file mode 100644 index 000000000..3d0a9e30c --- /dev/null +++ b/docs/resource-policy/developers-guide/testing.rst @@ -0,0 +1,8 @@ +Testing +####### + +.. toctree:: + :maxdepth: 1 + + unit-test.md + e2e-test.md diff --git a/docs/resource-policy/developers-guide/unit-test.md b/docs/resource-policy/developers-guide/unit-test.md new file mode 100644 index 000000000..752a11f23 --- /dev/null +++ b/docs/resource-policy/developers-guide/unit-test.md @@ -0,0 +1,7 @@ +# Unit tests + +Run unit tests with +``` +make test +``` + diff --git a/docs/resource-policy/index.rst b/docs/resource-policy/index.rst new file mode 100644 index 000000000..2fcb51f6c --- /dev/null +++ b/docs/resource-policy/index.rst @@ -0,0 +1,16 @@ +.. NRI Resource Policy documentation master file + +Resource Policy Plugin +====================== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + introduction.md + installation.md + setup.md + policy/index.rst + node-agent.md + + developers-guide/index.rst diff --git a/docs/resource-policy/installation.md b/docs/resource-policy/installation.md new file mode 100644 index 000000000..25267fe2b --- /dev/null +++ b/docs/resource-policy/installation.md @@ -0,0 +1 @@ +# Installation diff --git a/docs/resource-policy/introduction.md b/docs/resource-policy/introduction.md new file mode 100644 index 000000000..9c5e47260 --- /dev/null +++ b/docs/resource-policy/introduction.md @@ -0,0 +1,12 @@ +# Introduction + +NRI Resource Policy is a NRI container runtime plugin. It is connected +to Container Runtime implementation (containerd, cri-o) via NRI API. +The main purpose of the the NRI resource plugin is to apply hardware-aware +resource allocation policies to the containers running in the system. + +There are different policies available, each with a different set of +goals in mind and implementing different hardware allocation strategies. The +details of whether and how a container resource request is altered or +if extra actions are performed depend on which policy plugin is running +and how that policy is configured. diff --git a/docs/resource-policy/node-agent.md b/docs/resource-policy/node-agent.md new file mode 100644 index 000000000..ea7a57840 --- /dev/null +++ b/docs/resource-policy/node-agent.md @@ -0,0 +1,27 @@ +# Dynamic Configuration + +NRI Resource Policy plugin can be configured dynamically using ConfigMaps. + +The plugin daemon monitors two ConfigMaps for the node, a primary node-specific one +and a secondary group-specific or default one, depending on whether the node +belongs to a configuration group. The node-specific ConfigMap always takes +precedence over the others. + +The names of these ConfigMaps are + +1. `nri-resource-policy-config.node.$NODE_NAME`: primary, node-specific configuration +2. `nri-resource-policy-config.group.$GROUP_NAME`: secondary group-specific node + configuration +3. `nri-resource-policy-config.default`: secondary: secondary default node + configuration + +You can assign a node to a configuration group by setting the +`resource-policy.nri.io/group` label on the node to the name of +the configuration group. You can remove a node from its group by deleting +the node group label. + +There is a +[sample ConfigMap spec](/sample-configs/nri-resource-policy-configmap.example.yaml) +that contains a node-specific, a group-specific, and a default ConfigMap +example. See [any available policy-specific documentation](policy/index.rst) +for more information on the policy configurations. diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md new file mode 100644 index 000000000..f6a9c80da --- /dev/null +++ b/docs/resource-policy/policy/balloons.md @@ -0,0 +1,223 @@ +# Balloons Policy + +## Overview + +The balloons policy implements workload placement into "balloons" that +are disjoint CPU pools. Balloons can be inflated and deflated, that is +CPUs added and removed, based on the CPU resource requests of +containers. Balloons can be static or dynamically created and +destroyed. CPUs in balloons can be configured, for example, by setting +min and max frequencies on CPU cores and uncore. + +## How It Works + +1. User configures balloon types from which the policy instantiates + balloons. + +2. A balloon has a set of CPUs and a set of containers that run on the + CPUs. + +3. Every container is assigned to exactly one balloon. 
A container is + allowed to use all CPUs of its balloon and no other CPUs. + +4. Every logical CPU belongs to at most one balloon. There can be CPUs + that do not belong to any balloon. + +5. The number of CPUs in a balloon can change during the lifetime of + the balloon. If a balloon inflates, that is CPUs are added to it, + all containers in the balloon are allowed to use more CPUs. If a + balloon deflates, the opposite is true. + +6. When a new container is created on a Kubernetes node, the policy + first decides the type of the balloon that will run the + container. The decision is based on annotations of the pod, or the + namespace if annotations are not given. + +7. Next the policy decides which balloon of the decided type will run + the container. Options are: + - an existing balloon that already has enough CPUs to run its + current and new containers + - an existing balloon that can be inflated to fit its current and + new containers + - new balloon. + +9. When a CPU is added to a balloon or removed from it, the CPU is + reconfigured based on balloon's CPU class attributes, or idle CPU + class attributes. + +## Deployment + +Deploy nri-resource-policy-balloons on each node as you would for any +other policy. See [installation](../installation.md) for more details. + +## Configuration + +The balloons policy is configured using the yaml-based configuration +system of nri-resource-policy. +See [setup and usage](../setup.md#setting-up-nri-resource-policy) for +more details on managing the configuration. + +### Parameters + +Balloons policy parameters: + +- `PinCPU` controls pinning a container to CPUs of its balloon. The + default is `true`: the container cannot use other CPUs. +- `PinMemory` controls pinning a container to the memories that are + closest to the CPUs of its balloon. The default is `true`: allow + using memory only from the closest NUMA nodes. Warning: this may + cause kernel to kill workloads due to out-of-memory error when + closest NUMA nodes do not have enough memory. In this situation + consider switching this option `false`. +- `IdleCPUClass` specifies the CPU class of those CPUs that do not + belong to any balloon. +- `ReservedPoolNamespaces` is a list of namespaces (wildcards allowed) + that are assigned to the special reserved balloon, that is, will run + on reserved CPUs. This always includes the `kube-system` namespace. +- `AllocatorTopologyBalancing` affects selecting CPUs for new + balloons. If `true`, new balloons are created using CPUs on + NUMA/die/package with most free CPUs, that is, balloons are spread + across the hardware topology. This helps inflating balloons within + the same NUMA/die/package and reduces interference between workloads + in balloons when system is not fully loaded. The default is `false`: + pack new balloons tightly into the same NUMAs/dies/packages. This + helps keeping large portions of hardware idle and entering into deep + power saving states. +- `BalloonTypes` is a list of balloon type definitions. Each type can + be configured with the following parameters: + - `Name` of the balloon type. This is used in pod annotations to + assign containers to balloons of this type. + - `Namespaces` is a list of namespaces (wildcards allowed) whose + pods should be assigned to this balloon type, unless overridden by + pod annotations. + - `MinBalloons` is the minimum number of balloons of this type that + is always present, even if the balloons would not have any + containers. 
The default is 0: if a balloon has no containers, it + can be destroyed. + - `MaxBalloons` is the maximum number of balloons of this type that + is allowed to co-exist. The default is 0: creating new balloons is + not limited by the number of existing balloons. + - `MaxCPUs` specifies the maximum number of CPUs in any balloon of + this type. Balloons will not be inflated larger than this. 0 means + unlimited. + - `MinCPUs` specifies the minimum number of CPUs in any balloon of + this type. When a balloon is created or deflated, it will always + have at least this many CPUs, even if containers in the balloon + request less. + - `CpuClass` specifies the name of the CPU class according to which + CPUs of balloons are configured. + - `PreferSpreadingPods`: if `true`, containers of the same pod + should be spread to different balloons of this type. The default + is `false`: prefer placing containers of the same pod to the same + balloon(s). + - `PreferPerNamespaceBalloon`: if `true`, containers in the same + namespace will be placed in the same balloon(s). On the other + hand, containers in different namespaces are preferrably placed in + different balloons. The default is `false`: namespace has no + effect on choosing the balloon of this type. + - `PreferNewBalloons`: if `true`, prefer creating new balloons over + placing containers to existing balloons. This results in + preferring exclusive CPUs, as long as there are enough free + CPUs. The default is `false`: prefer filling and inflating + existing balloons over creating new ones. + - `ShareIdleCPUsInSame`: Whenever the number of or sizes of balloons + change, idle CPUs (that do not belong to any balloon) are reshared + as extra CPUs to workloads in balloons with this option. The value + sets locality of allowed extra CPUs that will be common to these + workloads. + - `system`: workloads are allowed to use idle CPUs available + anywhere in the system. + - `package`: ...allowed to use idle CPUs in the same package(s) + (sockets) as the balloon. + - `die`: ...in the same die(s) as the balloon. + - `numa`: ...in the same numa node(s) as the balloon. + - `core`: ...allowed to use idle CPU threads in the same cores with + the balloon. + - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU + allocator parameter, used when creating new or resizing existing + balloons. If there are balloon types with pre-created balloons + (`MinBalloons` > 0), balloons of the type with the highest + `AllocatorPriority` are created first. + +Related configuration parameters: +- `policy.ReservedResources.CPU` specifies the (number of) CPUs in the + special `reserved` balloon. By default all containers in the + `kube-system` namespace are assigned to the reserved balloon. +- `cpu.classes` defines CPU classes and their parameters (such as + `minFreq`, `maxFreq`, `uncoreMinFreq` and `uncoreMaxFreq`). + +### Example + +Example configuration that runs all pods in balloons of 1-4 CPUs. +```yaml +policy: + Active: balloons + ReservedResources: + CPU: 1 + balloons: + PinCPU: true + PinMemory: true + IdleCPUClass: lowpower + BalloonTypes: + - Name: "quad" + MinCpus: 1 + MaxCPUs: 4 + CPUClass: dynamic + Namespaces: + - "*" +cpu: + classes: + lowpower: + minFreq: 800 + maxFreq: 800 + dynamic: + minFreq: 800 + maxFreq: 3600 + turbo: + minFreq: 3000 + maxFreq: 3600 + uncoreMinFreq: 2000 + uncoreMaxFreq: 2400 +``` + +See the [sample configmap](/sample-configs/balloons-policy.cfg) for a +complete example. 
+ +## Assigning a Container to a Balloon + +The balloon type of a container can be defined in pod annotations. In +the example below, the first annotation sets the balloon type (`BT`) +of a single container (`CONTAINER_NAME`). The last two annotations set +the default balloon type for all containers in the pod. + +```yaml +balloon.balloons.resource-policy.nri.io/container.CONTAINER_NAME: BT +balloon.balloons.resource-policy.nri.io/pod: BT +balloon.balloons.resource-policy.nri.io: BT +``` + +If a pod has no annotations, its namespace is matched to the +`Namespaces` of balloon types. The first matching balloon type is +used. + +If the namespace does not match, the container is assigned to the +special `default` balloon, that means reserved CPUs unless `MinCPUs` +or `MaxCPUs` of the `default` balloon type are explicitely defined in +the `BalloonTypes` configuration. + +## Metrics and Debugging + +In order to enable more verbose logging and metrics exporting from the +balloons policy, enable instrumentation and policy debugging from the +nri-resource-policy global config: + +```yaml +instrumentation: + # The balloons policy exports containers running in each balloon, + # and cpusets of balloons. Accessible in command line: + # curl --silent http://localhost:8891/metrics + HTTPEndpoint: :8891 + PrometheusExport: true +logger: + Debug: policy +``` diff --git a/docs/resource-policy/policy/container-affinity.md b/docs/resource-policy/policy/container-affinity.md new file mode 100644 index 000000000..d4a1ab2e2 --- /dev/null +++ b/docs/resource-policy/policy/container-affinity.md @@ -0,0 +1,267 @@ +# Container Affinity and Anti-Affinity + +## Introduction + +The topology-aware resource policy allow the user to give hints about how +particular containers should be *co-located* within a node. In particular these +hints express whether containers should be located *'close'* to each other or +*'far away'* from each other, in a hardware topology sense. + +Since these hints are interpreted always by a particular *policy implementation*, +the exact definitions of 'close' and 'far' are also somewhat *policy-specific*. +However as a general rule of thumb containers running + + - on CPUs within the *same NUMA nodes* are considered *'close'* to each other, + - on CPUs within *different NUMA nodes* in the *same socket* are *'farther'*, and + - on CPUs within *different sockets* are *'far'* from each other + +These hints are expressed by `container affinity annotations` on the Pod. +There are two types of affinities: + + - `affinity` (or `positive affinty`): cause affected containers to *pull* each other closer + - `anti-affinity` (or `negative affinity`): cause affected containers to *push* each other further away + +Policies try to place a container + - close to those the container has affinity towards + - far from those the container has anti-affinity towards. + +## Affinity Annotation Syntax + +*Affinities* are defined as the `resource-policy.nri.io/affinity` annotation. +*Anti-affinities* are defined as the `resource-manager.nri.io/anti-affinity` +annotation. They are specified in the `metadata` section of the `Pod YAML`, under +`annotations` as a dictionary, with each dictionary key being the name of the +*container* within the Pod to which the annotation belongs to. + +```yaml +metadata: + anotations: + resource-manager.nri.io/affinity: | + container1: + - scope: + key: key-ref + operator: op + values: + - value1 + ... + - valueN + match: + key: key-ref + operator: op + values: + - value1 + ... 
+ - valueN + weight: w +``` + +An anti-affinity is defined similarly but using `resource-manager.nri.io/anti-affinity` +as the annotation key. + +```yaml +metadata: + anotations: + resource-manager.nri.io/anti-affinity: | + container1: + - scope: + key: key-ref + operator: op + values: + - value1 + ... + - valueN + match: + key: key-ref + operator: op + values: + - value1 + ... + - valueN + weight: w +``` + +## Affinity Semantics + +An affinity consists of three parts: + + - `scope expression`: defines which containers this affinity is evaluated against + - `match expression`: defines for which containers (within the scope) the affinity applies to + - `weight`: defines how *strong* a pull or a push the affinity causes + +*Affinities* are also sometimes referred to as *positive affinities* while +*anti-affinities* are referred to as *negative affinities*. The reason for this is +that the only difference between these are that affinities have a *positive weight* +while anti-affinities have a *negative weight*. + +The *scope* of an affinity defines the *bounding set of containers* the affinity can +apply to. The affinity *expression* is evaluated against the containers *in scope* and +it *selects the containers* the affinity really has an effect on. The *weight* specifies +whether the effect is a *pull* or a *push*. *Positive* weights cause a *pull* while +*negative* weights cause a *push*. Additionally, the *weight* specifies *how strong* the +push or the pull is. This is useful in situations where the policy needs to make some +compromises because an optimal placement is not possible. The weight then also acts as +a way to specify preferences of priorities between the various compromises: the heavier +the weight the stronger the pull or push and the larger the propbability that it will be +honored, if this is possible at all. + +The scope can be omitted from an affinity in which case it implies *Pod scope*, in other +words the scope of all containers that belong to the same Pod as the container for which +which the affinity is defined. + +The weight can also be omitted in which case it defaults to -1 for anti-affinities +and +1 for affinities. Weights are currently limited to the range [-1000,1000]. + +Both the affinity scope and the expression select containers, therefore they are identical. +Both of them are *expressions*. An expression consists of three parts: + + - key: specifies what *metadata* to pick from a container for evaluation + - operation (op): specifies what *logical operation* the expression evaluates + - values: a set of *strings* to evaluate the the value of the key against + +The supported keys are: + + - for pods: + - `name` + - `namespace` + - `qosclass` + - `labels/` + - `id` + - `uid` + - for containers: + - `pod/` + - `name` + - `namespace` + - `qosclass` + - `labels/` + - `tags/` + - `id` + +Essentially an expression defines a logical operation of the form (key op values). +Evaluating this logical expression will take the value of the key in which +either evaluates to true or false. +a boolean true/false result. 
Currently the following operations are supported: + + - `Equals`: equality, true if the *value of key* equals the single item in *values* + - `NotEqual`: inequality, true if the *value of key* is not equal to the single item in *values* + - `In`: membership, true if *value of key* equals to any among *values* + - `NotIn`: negated membership, true if the *value of key* is not equal to any among *values* + - `Exists`: true if the given *key* exists with any value + - `NotExists`: true if the given *key* does not exist + - `AlwaysTrue`: always evaluates to true, can be used to denote node-global scope (all containers) + - `Matches`: true if the *value of key* matches the globbing pattern in values + - `MatchesNot`: true if the *value of key* does not match the globbing pattern in values + - `MatchesAny`: true if the *value of key* matches any of the globbing patterns in values + - `MatchesNone`: true if the *value of key* does not match any of the globbing patterns in values + +The effective affinity between containers C_1 and C_2, A(C_1, C_2) is the sum of the +weights of all pairwise in-scope matching affinities W(C_1, C_2). To put it another way, +evaluating an affinity for a container C_1 is done by first using the scope (expression) +to determine which containers are in the scope of the affinity. Then, for each in-scope +container C_2 for which the match expression evaluates to true, taking the weight of the +affinity and adding it to the effective affinity A(C_1, C_2). + +Note that currently (for the topology-aware policy) this evaluation is asymmetric: +A(C_1, C_2) and A(C_2, C_1) can and will be different unless the affinity annotations are +crafted to prevent this (by making them fully symmetric). Moreover, A(C_1, C_2) is calculated +and taken into consideration during resource allocation for C_1, while A(C_2, C_1) +is calculated and taken into account during resource allocation for C_2. This might be +changed in a future version. + + +Currently affinity expressions lack support for boolean operators (and, or, not). +Sometimes this limitation can be overcome by using joint keys, especially with +matching operators. The joint key syntax allows joining the value of several keys +with a separator into a single value. A joint key can be specified in a simple or +full format: + + - simple: ``, this is equivalent to `:::` + - full: `` + +A joint key evaluates to the values of all the ``-separated subkeys joined by ``. +A non-existent subkey evaluates to the empty string. For instance the joint key + + `:pod/qosclass:pod/name:name` + +evaluates to + + `::` + +For existence operators, a joint key is considered to exist if any of its subkeys exists. + + +## Examples + +Put the container `peter` close to the container `sheep` but far away from the +container `wolf`. + +```yaml +metadata: + annotations: + resource-manager.nri.io/affinity: | + peter: + - match: + key: name + operator: Equals + values: + - sheep + weight: 5 + resource-manager.nri.io/anti-affinity: | + peter: + - match: + key: name + operator: Equals + values: + - wolf + weight: 5 +``` + +## Shorthand Notation + +There is an alternative shorthand syntax for what is considered to be the most common +case: defining affinities between containers within the same pod. With this notation +one needs to give just the names of the containers, like in the example below. 
+ +```yaml + annotations: + resource-manager.nri.io/affinity: | + container3: [ container1 ] + resource-manager.nri.io/anti-affinity: | + container3: [ container2 ] + container4: [ container2, container3 ] +``` + + +This shorthand notation defines: + - `container3` having + - affinity (weight 1) to `container1` + - `anti-affinity` (weight -1) to `container2` + - `container4` having + - `anti-affinity` (weight -1) to `container2`, and `container3` + +The equivalent annotation in full syntax would be + +```yaml +metadata: + annotations: + resource-manager.nri.io/affinity: |+ + container3: + - match: + key: labels/io.kubernetes.container.name + operator: In + values: + - container1 + resource-manager.nri.io/anti-affinity: |+ + container3: + - match: + key: labels/io.kubernetes.container.name + operator: In + values: + - container2 + container4: + - match: + key: labels/io.kubernetes.container.name + operator: In + values: + - container2 + - container3 +``` diff --git a/docs/resource-policy/policy/cpu-allocator.md b/docs/resource-policy/policy/cpu-allocator.md new file mode 100644 index 000000000..8d7eb0419 --- /dev/null +++ b/docs/resource-policy/policy/cpu-allocator.md @@ -0,0 +1,61 @@ +# CPU Allocator + +NRI Resource Policy has a separate CPU allocator component that helps policies +make educated allocation of CPU cores for workloads. Currently all policies +utilize the built-in CPU allocator. See policy specific documentation for more +details. + +## Topology Based Allocation + +The CPU allocator tries to optimize the allocation of CPUs in terms of the +hardware topology. More specifically, it aims at packing all CPUs of one +request "near" each other in order to minimize memory latencies between CPUs. + +## CPU Prioritization + +The CPU allocator also does automatic CPU prioritization by detecting CPU +features and their configuration parameters. Currently, NRI Resource Policy +supports CPU priority detection based on the `intel_pstate` scaling +driver in the Linux CPUFreq subsystem, and, Intel Speed Select Technology +(SST). + +CPUs are divided into three priority classes, i.e. *high*, *normal* and *low*. +Policies utilizing the CPU allocator may choose to prefer certain priority +class for certain types of workloads. For example, prefer (and preserve) high +priority CPUs for high priority workloads. + +### Intel Speed Select Technology (SST) + +NRI Resource Policy supports detection of all Intel Speed Select Technology +(SST) features, i.e. Speed Select Technology Performance Profile (SST-PP), Base +Frequency (SST-BF), Turbo Frequency (SST-TF) and Core Power (SST-CP). + +CPU prioritization is based on detection of the currently active SST features +and their parameterization: + +1. If SST-TF has been enabled, all CPUs prioritized by SST-TF are flagged as + high priority. +1. If SST-CP is enabled but SST-TF disabled, the CPU allocator examines the + active Classes of Service (CLOSes) and their parameters. CPUs associated + with the highest priority CLOS will be flagged as high priority, lowest + priority CLOS will be flagged as low priority and possible "middle priority" + CLOS as normal priority. +1. If SST-BF has been enabled and SST-TF and SST-CP are inactive, all BF high + priority cores (having higher guaranteed base frequency) will be flagged + as high priority. + +### Linux CPUFreq + +CPUFreq based prioritization only takes effect if Intel Speed Select Technology +(SST) is disabled (or not supported). 
NRI-RM divides CPU cores into priority +classes based on two parameters: + +- base frequency +- EPP (Energy-Performance Preference) + +CPU cores with high base frequency (relative to the other cores in the system) +will be flagged as high priority. Low base frequency will map to low priority, +correspondingly. + +CPU cores with high EPP priority (relative to the other cores in the system) +will be marked as high priority cores. diff --git a/docs/resource-policy/policy/index.rst b/docs/resource-policy/policy/index.rst new file mode 100644 index 000000000..35bd500de --- /dev/null +++ b/docs/resource-policy/policy/index.rst @@ -0,0 +1,10 @@ +Policies +######## + +.. toctree:: + :maxdepth: 1 + + topology-aware.md + balloons.md + container-affinity.md + cpu-allocator.md diff --git a/docs/resource-policy/policy/topology-aware.md b/docs/resource-policy/policy/topology-aware.md new file mode 100644 index 000000000..9904e891d --- /dev/null +++ b/docs/resource-policy/policy/topology-aware.md @@ -0,0 +1,432 @@ +# Topology-Aware Policy + +## Background + +On server-grade hardware the CPU cores, I/O devices and other peripherals +form a rather complex network together with the memory controllers, the +I/O bus hierarchy and the CPU interconnect. When a combination of these +resources are allocated to a single workload, the performance of that +workload can vary greatly, depending on how efficiently data is transferred +between them or, in other words, on how well the resources are aligned. + +There are a number of inherent architectural hardware properties that, +unless properly taken into account, can cause resource misalignment and +workload performance degradation. There are a multitude of CPU cores +available to run workloads. There are a multitude of memory controllers +these workloads can use to store and retrieve data from main memory. There +are a multitude of I/O devices attached to a number of I/O buses the same +workloads can access. The CPU cores can be divided into a number of groups, +with each group having different access latency and bandwidth to each +memory controller and I/O device. + +If a workload is not assigned to run with a properly aligned set of CPU, +memory and devices, it will not be able to achieve optimal performance. +Given the idiosyncrasies of hardware, allocating a properly aligned set +of resources for optimal workload performance requires identifying and +understanding the multiple dimensions of access latency locality present +in hardware or, in other words, hardware topology awareness. + +## Overview + +The `topology-aware` policy automatically builds a tree of pools based on the +detected hardware topology. Each pool has a set of CPUs and memory zones +assigned as their resources. Resource allocation for workloads happens by +first picking the pool which is considered to fit the best the resource +requirements of the workload and then assigning CPU and memory from this pool. + +The pool nodes at various depths from bottom to top represent the NUMA nodes, +dies, sockets, and finally the whole of the system at the root node. Leaf NUMA +nodes are assigned the memory behind their controllers / zones and CPU cores +with the smallest distance / access penalty to this memory. 
If the machine
has multiple types of memory separately visible to both the kernel and user
space, for instance both DRAM and
[PMEM](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html),
each zone of such special memory is assigned to the pool of its closest NUMA node.

Each non-leaf pool node in the tree is assigned the union of the resources of
its children. So in practice, die nodes end up containing all the CPU cores
and memory zones of the corresponding die, socket nodes end up containing the
CPU cores and memory zones of the corresponding socket's dies, and the root
node ends up containing all CPU cores and memory zones in all sockets.

With this setup, each pool in the tree has a topologically aligned set of CPU
and memory resources. The amount of available resources gradually increases in
the tree from bottom to top, while the strictness of alignment is gradually
relaxed. In other words, moving from bottom to top in the tree it gets
gradually easier to fit in a workload, but at the price of a gradually
increasing maximum potential cost or penalty for memory access and data
transfer between CPU cores.

Another property of this setup is that the resource sets of sibling pools at
the same depth in the tree are disjoint, while the resource sets of descendant
pools along the same path in the tree partially overlap, with the intersection
decreasing as the distance between the pools increases. This makes it easy to
isolate workloads from each other. As long as workloads are assigned to pools
which have no common ancestor other than the root, the resources of these
workloads are isolated from each other as well as possible on the given
hardware.

With such an arrangement, this policy should handle topology-aware alignment
of resources without any special or extra configuration.
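To make this structure concrete, the pool tree for a hypothetical two-socket
machine with two NUMA nodes per socket could be sketched as below; the pool
names and CPU/memory ranges are purely illustrative, since the policy always
derives the actual tree from the detected hardware topology.

```yaml
# Illustrative pool tree only; the real tree is built from the detected topology.
root:                       # all CPU cores and memory zones, loosest alignment
  socket0:                  # union of the resources of numa0 and numa1
    numa0: { cpus: "0-7",   memory: "node0" }
    numa1: { cpus: "8-15",  memory: "node1" }
  socket1:                  # union of the resources of numa2 and numa3
    numa2: { cpus: "16-23", memory: "node2" }
    numa3: { cpus: "24-31", memory: "node3" }
```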
When allocating +resources, the policy + + - filters out all pools with insufficient free capacity + - runs a scoring algorithm for the remaining ones + - picks the one with the best score + - assigns resources to the workload from there + +Although the details of the scoring algorithm are subject to change as the +implementation evolves, its basic principles are roughly + + - prefer pools lower in the tree, IOW stricter alignment and lower latency + - prefer idle pools over busy ones, IOW more remaining free capacity and + fewer workloads + - prefer pools with better overall device alignment + +## Features + +The `topology-aware` policy has the following features: + + - topologically aligned allocation of CPU and memory + * assign CPU and memory to workloads with tightest available alignment + - aligned allocation of devices + * pick pool for workload based on locality of devices already assigned + - shared allocation of CPU cores + * assign workload to shared subset of pool CPUs + - exclusive allocation of CPU cores + * dynamically slice off CPU cores from shared subset and assign to workload + - mixed allocation of CPU cores + * assign both exclusive and shared CPU cores to workload + - discovering and using kernel-isolated CPU cores (['isolcpus'](https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists)) + * use kernel-isolated CPU cores for exclusively assigned CPU cores + - exposing assigned resources to workloads + - notifying workloads about changes in resource assignment + - dynamic relaxation of memory alignment to prevent OOM + * dynamically widen workload memory set to avoid pool/workload OOM + - multi-tier memory allocation + * assign workloads to memory zones of their preferred type + * the policy knows about three kinds of memory: + - DRAM is regular system main memory + - PMEM is large-capacity memory, such as + [IntelĀ® Optaneā„¢ memory](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html) + - [HBM](https://en.wikipedia.org/wiki/High_Bandwidth_Memory) is high speed memory, + typically found on some special-purpose computing systems + - cold start + * pin workload exclusively to PMEM for an initial warm-up period + - dynamic page demotion + * forcibly migrate read-only and idle container memory pages to PMEM + +## Activating the Policy + +You can activate the `topology-aware` policy by using the following configuration +fragment in the configuration for `nri-resource-policy-topology-aware`: + +```yaml +policy: + Active: topology-aware + ReservedResources: + CPU: 750m +``` + +## Configuring the Policy + +The policy has a number of configuration options which affect its default behavior. +These options can be supplied as part of the +[dynamic configuration](../setup.md#using-nri-resource-policy-agent-and-a-configmap) +received via the [`node agent`](../node-agent.md), or in a fallback or forced +configuration file. 
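For illustration, a fallback or forced configuration fragment that sets a few
of these options might look roughly like the sketch below. The option values
(and the extra reserved namespaces) are only examples, not recommendations.

```yaml
policy:
  Active: topology-aware
  ReservedResources:
    CPU: 750m
  topology-aware:
    PinCPU: true
    PinMemory: true
    PreferIsolatedCPUs: true
    # hypothetical extra namespaces that should run on kube-reserved CPUs
    ReservedPoolNamespaces: ["monitoring", "reserved-*"]
```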
These configuration options are + + - `PinCPU` + * whether to pin workloads to assigned pool CPU sets + - `PinMemory` + * whether to pin workloads to assigned pool memory zones + - `PreferIsolatedCPUs` + * whether isolated CPUs are preferred by default for workloads that are + eligible for exclusive CPU allocation + - `PreferSharedCPUs` + * whether shared allocation is preferred by default for workloads that + would be otherwise eligible for exclusive CPU allocation + - `ReservedPoolNamespaces` + * list of extra namespaces (or glob patters) that will be allocated to reserved CPUs + - `ColocatePods` + * whether try to allocate containers in a pod to the same or close by topology pools + - `ColocateNamespaces` + * whether try to allocate containers in a namespace to the same or close by topology pools + +## Policy CPU Allocation Preferences + +There are a number of workload properties this policy actively checks to decide +if the workload could potentially benefit from extra resource allocation +optimizations. Unless configured differently, containers fulfilling certain +corresponding criteria are considered eligible for these optimizations. This +will be reflected in the assigned resources whenever that is possible at the +time the container's creation / resource allocation request hits the policy. + +The set of these extra optimizations consist of + + - assignment of `kube-reserved` CPUs + - assignment of exclusively allocated CPU cores + - usage of kernel-isolated CPU cores (for exclusive allocation) + +The policy uses a combination of the QoS class and the resource requirements of +the container to decide if any of these extra allocation preferences should be +applied. Containers are divided into five groups, with each group having a +slightly different set of criteria for eligibility. + + - `kube-system` group + * all containers in the `kube-system` namespace + - `low-priority` group + * containers in the `BestEffort` or `Burstable` QoS class + - `sub-core` group + * Guaranteed QoS class containers with `CPU request < 1 CPU` + - `mixed` group + * Guaranteed QoS class containers with `1 <= CPU request < 2` + - `multi-core` group + * Guaranteed QoS class containers with `CPU request >= 2` + +The eligibility rules for extra optimization are slightly different among these +groups. + + - `kube-system` + * not eligible for extra optimizations + * eligible to run on `kube-reserved` CPU cores + * always run on shared CPU cores + - `low-priority` + * not eligible for extra optimizations + * always run on shared CPU cores + - `sub-core` + * not eligible for extra optimizations + * always run on shared CPU cores + - `mixed` + * by default eligible for exclusive and isolated allocation + * not eligible for either if `PreferSharedCPUs` is set to true + * not eligible for either if annotated to opt out from exclusive allocation + * not eligible for isolated allocation if annotated to opt out + - `multi-core` + * CPU request fractional (`(CPU request % 1000 milli-CPU) != 0`): + - by default not eligible for extra optimizations + - eligible for exclusive and isolated allocation if annotated to opt in + * CPU request not fractional: + - by default eligible for exclusive allocation + - by default not eligible for isolated allocation + - not eligible for exclusive allocation if annotated to opt out + - eligible for isolated allocation if annotated to opt in + +Eligibility for kube-reserved CPU core allocation should always be possible to +honor. 
If this is not the case, it is probably due to an incorrect configuration +which underdeclares `ReservedResources`. In that case, ordinary shared CPU cores +will be used instead of kube-reserved ones. + +Eligibility for exclusive CPU allocation should always be possible to honor. +Eligibility for isolated core allocation is only honored if there are enough +isolated cores available to fulfill the exclusive part of the container's CPU +request with isolated cores alone. Otherwise ordinary CPUs will be allocated, +by slicing them off for exclusive usage from the shared subset of CPU cores in +the container's assigned pool. + +Containers in the kube-system group are pinned to share all kube-reserved CPU +cores. Containers in the low-priority or sub-core groups, and containers which +are only eligible for shared CPU core allocation in the mixed and multi-core +groups, are all pinned to run on the shared subset of CPU cores in the +container's assigned pool. This shared subset can and usually does change +dynamically as exclusive CPU cores are allocated and released in the pool. + +## Container CPU Allocation Preference Annotations + +Containers can be annotated to diverge from the default CPU allocation +preferences the policy would otherwise apply to them. These Pod annotations +can be given both with per pod and per container resolution. If for any +container both of these exist, the container-specific one takes precedence. + +### Shared, Exclusive, and Isolated CPU Preference + +A container can opt in to or opt out from shared CPU allocation using the +following Pod annotation. + +```yaml +metadata: + annotations: + # opt in container C1 to shared CPU core allocation + prefer-shared-cpus.resource-policy.nri.io/container.C1: "true" + # opt in the whole pod to shared CPU core allocation + prefer-shared-cpus.resource-policy.nri.io/pod: "true" + # selectively opt out container C2 from shared CPU core allocation + prefer-shared-cpus.resource-policy.nri.io/container.C2: "false" +``` + +Opting in to exclusive allocation happens by opting out from shared allocation, +and opting out from exclusive allocation happens by opting in to shared +allocation. + +A container can opt in to or opt out from isolated exclusive CPU core +allocation using the following Pod annotation. + +```yaml +metadata: + annotations: + # opt in container C1 to isolated exclusive CPU core allocation + prefer-isolated-cpus.resource-policy.nri.io/container.C1: "true" + # opt in the whole pod to isolated exclusive CPU core allocation + prefer-isolated-cpus.resource-policy.nri.io/pod: "true" + # selectively opt out container C2 from isolated exclusive CPU core allocation + prefer-isolated-cpus.resource-policy.nri.io/container.C2: "false" +``` + +These Pod annotations have no effect on containers which are not eligible for +exclusive allocation. + +### Implicit Hardware Topology Hints + +`NRI Resource Policy` automatically generates HW `Topology Hints` for devices +assigned to a container, prior to handing the container off to the active policy +for resource allocation. The `topology-aware` policy is hint-aware and normally +takes topology hints into account when picking the best pool to allocate resources. +Hints indicate optimal `HW locality` for device access and they can alter +significantly which pool gets picked for a container. + +Since device topology hints are implicitly generated, there are cases where one +would like the policy to disregard them altogether. 
For instance, when a local +volume is used by a container but not in any performance critical manner. + +Containers can be annotated to opt out from and selectively opt in to hint-aware +pool selection using the following Pod annotations. + +```yaml +metadata: + annotations: + # only disregard hints for container C1 + topologyhints.resource-policy.nri.io/container.C1: "false" + # disregard hints for all containers by default + topologyhints.resource-policy.nri.io/pod: "false" + # but take hints into account for container C2 + topologyhints.resource-policy.nri.io/container.C2: "true" +``` + +Topology hint generation is globally enabled by default. Therefore, using the +Pod annotation as opt in only has an effect when the whole pod is annotated to +opt out from hint-aware pool selection. + +### Implicit Topological Co-location for Pods and Namespaces + +The `ColocatePods` or `ColocateNamespaces` configuration options control whether +the policy will try to co-locate, that is allocate topologically close, containers +within the same Pod or K8s namespace. + +Both of these options are false by default. Setting them to true is a shorthand +for adding to each container an affinity of weight 10 for all other containers +in the same pod or namespace. + +Containers with user-defined affinities are never extended with either of these +co-location affinities. However, such containers can still have affinity effects +on other containers that do get extended with co-location. Therefore mixing user- +defined affinities with implicit co-location requires both careful consideration +and a thorough understanding of affinity evaluation, or it should be avoided +altogether. + +## Cold Start + +The `topology-aware` policy supports "cold start" functionality. When cold start +is enabled and the workload is allocated to a topology node with both DRAM and +PMEM memory, the initial memory controller is only the PMEM controller. DRAM +controller is added to the workload only after the cold start timeout is +done. The effect of this is that allocated large unused memory areas of +memory don't need to be migrated to PMEM, because it was allocated there to +begin with. Cold start is configured like this in the pod metadata: + +```yaml +metadata: + annotations: + memory-type.resource-policy.nri.io/container.container1: dram,pmem + cold-start.resource-policy.nri.io/container.container1: | + duration: 60s +``` + +Again, alternatively you can use the following deprecated Pod annotation syntax +to achieve the same, but support for this syntax is subject to be dropped in a +future release: + +```yaml +metadata: + annotations: + resource-policy.nri.io/memory-type: | + container1: dram,pmem + resource-policy.nri.io/cold-start: | + container1: + duration: 60s +``` + +In the above example, `container1` would be initially granted only PMEM +memory controller, but after 60 seconds the DRAM controller would be +added to the container memset. + +## Dynamic Page Demotion + +The `topology-aware` policy also supports dynamic page demotion. With dynamic +demotion enabled, rarely-used pages are periodically moved from DRAM to PMEM +for those workloads which are assigned to use both DRAM and PMEM memory types. +The configuration for this feature is done using three configuration keys: +`DirtyBitScanPeriod`, `PageMovePeriod`, and `PageMoveCount`. All of these +parameters need to be set to non-zero values in order for dynamic page demotion +to get enabled. 
See this configuration file fragment as an example: + +```yaml +policy: + Active: topology-aware + topology-aware: + DirtyBitScanPeriod: 10s + PageMovePeriod: 2s + PageMoveCount: 1000 +``` + +In this setup, every pid in every container in every non-system pod +fulfilling the memory container requirements would have their page ranges +scanned for non-accessed pages every ten seconds. The result of the scan +would be fed to a page-moving loop, which would attempt to move 1000 pages +every two seconds from DRAM to PMEM. + +## Container memory requests and limits + +Due to inaccuracies in how `nri-resource-policy` calculates memory requests for +pods in QoS class `Burstable`, you should either use `Limit` for setting +the amount of memory for containers in `Burstable` pods to provide `cri-resmgr` +with an exact copy of the resource requirements from the Pod Spec as an extra +Pod annotation. + +## Reserved pool namespaces + +User is able to mark certain namespaces to have a reserved CPU allocation. +Containers belonging to such namespaces will only run on CPUs set aside +according to the global CPU reservation, as configured by the ReservedResources +configuration option in the policy section. +The `ReservedPoolNamespaces` option is a list of namespace globs that will be +allocated to reserved CPU class. + +For example: + +```yaml +policy: + Active: topology-aware + topology-aware: + ReservedPoolNamespaces: ["my-pool","reserved-*"] +``` + +In this setup, all the workloads in `my-pool` namespace and those namespaces +starting with `reserved-` string are allocated to reserved CPU class. +The workloads in `kube-system` are automatically assigned to reserved CPU +class so no need to mention `kube-system` in this list. + +## Reserved CPU annotations + +User is able to mark certain pods and containers to have a reserved CPU allocation +by using annotations. Containers having a such annotation will only run on CPUs set +aside according to the global CPU reservation, as configured by the ReservedResources +configuration option in the policy section. + +For example: + +```yaml +metadata: + annotations: + prefer-reserved-cpus.resource-policy.nri.io/pod: "true" + prefer-reserved-cpus.resource-policy.nri.io/container.special: "false" +``` diff --git a/docs/resource-policy/setup.md b/docs/resource-policy/setup.md new file mode 100644 index 000000000..6eee35be9 --- /dev/null +++ b/docs/resource-policy/setup.md @@ -0,0 +1,61 @@ +# Setup and Usage + +When you want to try NRI Resource Policy, here is the list of things +you need to do, assuming you already have a Kubernetes\* cluster up and +running, using either `containerd` or `cri-o` as the runtime. + + * [Install](installation.md) NRI Resource Policy DaemonSet deployment file. + * Runtime (containerd / cri-o) configuration + +For NRI Resource Policy, you need to provide a configuration file. The default +configuration ConfigMap file can be found in the DaemonSet deployment yaml file. +You can edit it as needed. + +**NOTE**: Currently, the available policies are a work in progress. + +## Setting up NRI Resource Policy + +### Using NRI Resource Policy Agent and a ConfigMap + +The [NRI Resource Policy Node Agent][agent] can monitor and fetch configuration +from the ConfigMap and pass it on to NRI Resource Policy plugin. +By default, it automatically tries to use the agent to acquire configuration, +unless you override this by forcing a static local configuration using +the `--force-config ` option. 
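The `policy` data in that ConfigMap uses the same configuration schema as a
local configuration file. A minimal sketch, modeled on
`sample-configs/nri-resource-policy-configmap.example.yaml` from this
repository (adjust the name, namespace and values for your cluster), might
look like this:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: nri-resmgr-config.default   # group- or node-specific variants also exist
  namespace: kube-system
data:
  policy: |+
    Active: topology-aware
    ReservedResources:
      CPU: 750m
```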
+When using the agent, it is also possible to provide an initial fallback for +configuration using the `--fallback-config `. This file is +used before the very first configuration is successfully acquired from the +agent. + +See the [Node Agent][agent] about how to set up and configure the agent. + + +## Logging and debugging + +You can control logging with the klog command line options or by setting the +corresponding environment variables. You can get the name of the environment +variable for a command line option by prepending the `LOGGER_` prefix to the +capitalized option name without any leading dashes. For instance, setting the +environment variable `LOGGER_SKIP_HEADERS=true` has the same effect as using +the `-skip_headers` command line option. + +Additionally, the `LOGGER_DEBUG` environment variable controls debug logs. +These are globally disabled by default. You can turn on full debugging by +setting `LOGGER_DEBUG='*'`. + +When using environment variables, be careful which configuration you pass to +NRI Resource Policy using a file or ConfigMap. The environment is treated +as default configuration but a file or a ConfigMap has higher precedence. +If something is configured in both, the environment will only be in effect +until the configuration is applied. However, in such a case if you later +push an updated configuration to NRI Resource Policy with the overlapping +settings removed, the original ones from the environment will be in effect +again. + +For debug logs, the settings from the configuration are applied in addition +to any settings in the environment. That said, if you turn something on in +the environment but off in the configuration, it will be turned off +eventually. + + +[agent]: node-agent.md diff --git a/sample-configs/balloons-policy.cfg b/sample-configs/balloons-policy.cfg new file mode 100644 index 000000000..8ad17682b --- /dev/null +++ b/sample-configs/balloons-policy.cfg @@ -0,0 +1,94 @@ +policy: + Active: balloons + # Use only 15 CPUs in total, leave cpu0 for other than Kubernetes + # processes. + AvailableResources: + CPU: cpuset:1-15 + # Reserve one of our CPUs (cpu15) for kube-system tasks. + ReservedResources: + CPU: cpuset:15 + balloons: + # PinCPU: allow containers to use only the CPUs in their balloons. + PinCPU: true + # PinMemory: allow containers to use only the closest memory to + # the CPUs in their balloons. + PinMemory: true + # IdleCPUClass: how to configure CPUs that are not included in any + # of the balloons. + IdleCPUClass: idle + BalloonTypes: + - Name: "full-core-turbo" + # MinCPUs: minimum number of logical cores in every balloon + # instance of this type. + # The default is 0. + MinCPUs: 2 + # MaxCPUs: maximum number of logical cores in every balloon + # instance of this type. + # The default is 0 (unlimited). + MaxCPUs: 2 + # CPUClass: how to configure CPUs of these balloons. + # The default is "". + CPUClass: "turbo" + # Namespaces: assign pods in listed namespaces to these + # balloons, even if there is no explicit annotation: + # balloon.balloons.nri-resmgr.intel.com: full-core-turbo + # The default is to assign only annotated pods. + Namespaces: + - "highperf" + # AllocatorPriotity: CPU allocator priority (0: High, 1: + # Normal, 2: Low, 3: None). Affects the performance/type of + # CPUs that are selected into the balloon. CPUs for static + # balloon instances (MinBalloons > 0) with highest + # AllocatorPriority are reserved first. + # The default is 0. 
+ AllocatorPriority: 2 + # MinBalloons: how many balloon instances of this type are always + # kept in the system, even if there would not be workloads to them. + # The default is 0. + MinBalloons: 2 + # PreferNewBalloons: prefer creating a new balloon for + # separate pods, even if their CPU requirements would allow + # putting them in the same balloon. + # The default is: false. + PreferNewBalloons: true + # PreferPerNamespaceBalloon: if true, containers in the same + # namespace are preferrably placed in the same balloon, and + # containers in different namespaces to different + # balloons. The default is false: namespaces have no effect on + # placement. + PreferPerNamespaceBalloon: false + # PreferSpreadingPods: if true, containers of single pod can + # be assigned in different balloons, based on which balloons + # have most free CPU resources. + # The default is: false: prefer running containers of a same + # pod in the same balloon(s). + PreferSpreadingPods: false + + - Name: "socket-size" + MaxCPUs: 8 + AllocatorPriority: 2 + Namespaces: + - "default" + CPUClass: "normal" +# CPU controller configuration specifies CPU class properties. CPUs of +# each balloon are configured based on its CPUClass. If a balloon has +# no CPUClass, the properties of the default class are applied. +cpu: + classes: + default: + minFreq: 800 + maxFreq: 1600 + turbo: + minFreq: 3300 + maxFreq: 3600 + normal: + minFreq: 800 + maxFreq: 2400 +instrumentation: + # The balloons policy exports containers running in each balloon, + # and cpusets of balloons. Accessible in command line: + # curl --silent http://localhost:8891/metrics + HTTPEndpoint: :8891 + PrometheusExport: true +logger: + Debug: policy diff --git a/sample-configs/blockio.cfg b/sample-configs/blockio.cfg new file mode 100644 index 000000000..9236e01a0 --- /dev/null +++ b/sample-configs/blockio.cfg @@ -0,0 +1,64 @@ +# This configuration demonstrates how to configure cgroups block io +# controller for pods. +# +# The configuration defines block device parameters for three blockio +# classes (LowPrioThrottled, HighPrioFullSpeed and Default, feel free +# to choose any names here). Finally resource-manager.blockio maps QOS +# classes BestEffort, Burstable (via wildcard), and Guaranteed to +# these classes. +# +# Try with: nri-resource-policy-topology-aware -force-config blockio.cfg + +logger: + Debug: blockio,cgroupblkio + +blockio: + Classes: + # LowPrioThrottled and HighPrioFullSpeed are user-defined blockio classes + # in this example. Pods and containers can be assigned to these classes using Pod + # metadata annotations. For example in Pod yaml: + # ... + # metadata: + # annotations: + # # Default blockio class for containers in the pod: + # blockioclass.cri-resource-manager.intel.com/pod: LowPrioThrottled + # # Special blockio class for a container in the pod: + # blockioclass.cri-resource-manager.intel.com/container.mycontainer: HighPrioFullSpeed + LowPrioThrottled: + # Default io-scheduler weight for all devices that are not + # explicitly mentioned in following items. + - Weight: 80 # will be written to cgroups(.bfq).weight + + # Configuration for all virtio and scsi block devices. 
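      # Device paths are glob patterns, and the throttle values below use
      # unit suffixes: M for megabytes per second, k for thousands of IOPS.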
+ - Devices: + - /dev/vd* + - /dev/sd* + ThrottleReadBps: 50M # max read bytes per second + ThrottleWriteBps: 10M # max write bytes per second + ThrottleReadIOPS: 10k # max read io operations per second + ThrottleWriteIOPS: 5k # max write io operations per second + Weight: 50 # io-scheduler (cfq/bfq) weight for these devices, + # will be written to cgroups(.bfq).weight_device + + # Configuration for SSD devices. + # This overrides above configuration for those /dev/sd* devices + # whose disk id contains "SSD" + - Devices: + - /dev/disk/by-id/*SSD* + ThrottleReadBps: 100M + ThrottleWriteBps: 40M + # Not mentioning Throttle*IOPS means no io operations throttling for matching devices. + Weight: 50 + + HighPrioFullSpeed: + - Weight: 400 + + # When Pod annotations do not define blockio class, QoS class + # names (BestEffort, Burstable, Guaranteed) are used as blockio + # class names for the pod. By default no blockio configuration + # takes place for them, but here we define I/O scheduler weight + # difference: + BestEffort: + - Weight: 90 + Guaranteed: + - Weight: 200 diff --git a/sample-configs/nri-resource-policy-configmap.example.yaml b/sample-configs/nri-resource-policy-configmap.example.yaml new file mode 100644 index 000000000..c40f32733 --- /dev/null +++ b/sample-configs/nri-resource-policy-configmap.example.yaml @@ -0,0 +1,304 @@ +# +# This example creates 3 ConfigMaps: +# - nri-resmgr-config.default: the default configuration +# - nri-resmgr-config.group.foo: the configuration for nodes in group foo +# - nri-resmgr-config.node.cl0-slave1: the configuration for node cl0-slave1 +# +# You can assign nodes to group foo using the command +# kubectl label --overwrite node $NODE_NAME nri-resmgr.intel.com/group=foo +# +# You can remove nodes from group foo using the command +# kubectl label node $NODE_NAME nri-resmgr.intel.com/group- +# + +apiVersion: v1 +kind: ConfigMap +metadata: + name: nri-resmgr-config.default + namespace: kube-system +data: + policy: |+ + Active: topology-aware + AvailableResources: + cpu: cpuset:0-63 + ReservedResources: + cpu: cpuset:0-1 + topology-aware: + PinCPU: true + PinMemory: true + PreferIsolatedCPUs: true + PreferSharedCPUs: false + static: + RelaxedIsolation: true + static-pools: + # Filesystem path to legacy configuration directory structure + ConfDirPath: "/etc/cmk" + # Filesystem path to legacy configuration file + ConfFilePath: "" + # Whether to create CMK node label + LabelNode: false + # Whether to create CMK node taint + TaintNode: false + # Pool configuration. + # The imaginary example system below consists of 4 sockets, 4 cores, 2 + # threads each. 
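      # With 4 sockets, 4 cores and 2 threads each, the system has 32 logical
      # CPUs (0-31); the pools below split them into exclusive, shared and
      # infra sets.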
+ pools: + exclusive: + # 6 exclusive cores, 3 on sockets 1, 2 and 3 each + cpuLists: + - Cpuset: 8,9 + Socket: 1 + - Cpuset: 10,11 + Socket: 1 + - Cpuset: 16,17 + Socket: 2 + - Cpuset: 18,19 + Socket: 2 + - Cpuset: 24,25 + Socket: 3 + - Cpuset: 26,27 + Socket: 3 + exclusive: true + shared: + # 2 cores in shared pool, all on socket 1 + cpuLists: + - Cpuset: 12,13,14,15 + Socket: 1 + exclusive: false + infra: + # Rest of cores designated to infra pool + cpuLists: + - Cpuset: 0,1,2,3,4,5,6,7 + Socket: 0 + - Cpuset: 20,21,22,23 + Socket: 2 + - Cpuset: 28,29,30,31 + Socket: 3 + exclusive: false + rdt: |+ + # Common options + options: + # One of Full, Discovery or Disabled + mode: Full + # Set to true to disable creation of monitoring groups + monitoringDisabled: false + l3: + # Make this false if L3 CAT must be available + optional: true + mb: + # Make this false if MBA must be available + optional: true + + # Configuration of classes + partitions: + exclusive: + # Allocate 60% of all L3 cache to the "exclusive" partition + l3Allocation: "60%" + mbAllocation: ["100%"] + classes: + Guaranteed: + # Allocate all of the partitions cache lines to "Guaranteed" + l3Allocation: "100%" + shared: + # Allocate 40% L3 cache IDs to the "shared" partition + # These will NOT overlap with the cache lines allocated for "exclusive" partition + l3Allocation: "40%" + mbAllocation: ["50%"] + classes: + Burstable: + # Allow "Burstable" to use all cache lines of the "shared" partition + l3Allocation: "100%" + BestEffort: + # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. + # These will overlap with those used by "Burstable" + l3Allocation: "50%" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + ## Configuration specific to a group of nodes can be specified with + #name: nri-resmgr-config.group. + namespace: kube-system +data: + policy: |+ + Active: topology-aware + AvailableResources: + cpu: cpuset:0-63 + ReservedResources: + cpu: cpuset:0-1 + topology-aware: + PinCPU: true + PinMemory: false + PreferIsolatedCPUs: false + PreferSharedCPUs: false + static: + RelaxedIsolation: true + static-pools: + # This is an example configuration for static-pools policy. + # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each. 
+ pools: + exclusive: + # 6 exclusive cores, 3 on sockets 1, 2 and 3 each + cpuLists: + - Cpuset: 8,9 + Socket: 1 + - Cpuset: 10,11 + Socket: 1 + - Cpuset: 16,17 + Socket: 2 + - Cpuset: 18,19 + Socket: 2 + - Cpuset: 24,25 + Socket: 3 + - Cpuset: 26,27 + Socket: 3 + exclusive: true + shared: + # 2 cores in shared pool, all on socket 1 + cpuLists: + - Cpuset: 12,13,14,15 + Socket: 1 + exclusive: false + infra: + # Rest of cores designated to infra pool + cpuLists: + - Cpuset: 0,1,2,3,4,5,6,7 + Socket: 0 + - Cpuset: 20,21,22,23 + Socket: 2 + - Cpuset: 28,29,30,31 + Socket: 3 + exclusive: false + rdt: |+ + # Common options + options: + # One of Full, Discovery or Disabled + mode: Full + # Set to true to disable creation of monitoring groups + monitoringDisabled: false + l3: + # Make this false if L3 CAT must be available + optional: true + mb: + # Make this false if MBA must be available + optional: true + + # Configuration of classes + partitions: + exclusive: + # Allocate 60% of all L3 cache to the "exclusive" partition + l3Allocation: "60%" + mbAllocation: ["100%"] + classes: + Guaranteed: + # Allocate all of the partitions cache lines to "Guaranteed" + l3Allocation: "100%" + shared: + # Allocate 40% L3 cache IDs to the "shared" partition + # These will NOT overlap with the cache lines allocated for "exclusive" partition + l3Allocation: "40%" + mbAllocation: ["50%"] + classes: + Burstable: + # Allow "Burstable" to use all cache lines of the "shared" partition + l3Allocation: "100%" + BestEffort: + # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. + # These will overlap with those used by "Burstable" + l3Allocation: "50%" +--- +apiVersion: v1 +kind: ConfigMap +metadata: + ## Node-specific configuration can be specified with + #name: nri-resmgr-config.node. + namespace: kube-system +data: + policy: |+ + Active: topology-aware + AvailableResources: + cpu: cpuset:0-63 + ReservedResources: + cpu: cpuset:0-1 + topology-aware: + PinCPU: false + PinMemory: true + PreferIsolatedCPUs: false + PreferSharedCPUs: false + static: + RelaxedIsolation: true + static-pools: + # This is an example configuration for static-pools policy. + # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each. 
+ pools: + exclusive: + # 6 exclusive cores, 3 on sockets 1, 2 and 3 each + cpuLists: + - Cpuset: 8,9 + Socket: 1 + - Cpuset: 10,11 + Socket: 1 + - Cpuset: 16,17 + Socket: 2 + - Cpuset: 18,19 + Socket: 2 + - Cpuset: 24,25 + Socket: 3 + - Cpuset: 26,27 + Socket: 3 + exclusive: true + shared: + # 2 cores in shared pool, all on socket 1 + cpuLists: + - Cpuset: 12,13,14,15 + Socket: 1 + exclusive: false + infra: + # Rest of cores designated to infra pool + cpuLists: + - Cpuset: 0,1,2,3,4,5,6,7 + Socket: 0 + - Cpuset: 20,21,22,23 + Socket: 2 + - Cpuset: 28,29,30,31 + Socket: 3 + exclusive: false + rdt: |+ + # Common options + options: + # One of Full, Discovery or Disabled + mode: Full + # Set to true to disable creation of monitoring groups + monitoringDisabled: false + l3: + # Make this false if L3 CAT must be available + optional: true + mb: + # Make this false if MBA must be available + optional: true + + # Configuration of classes + partitions: + exclusive: + # Allocate 60% of all L3 cache to the "exclusive" partition + l3Allocation: "60%" + mbAllocation: ["100%"] + classes: + Guaranteed: + # Allocate all of the partitions cache lines to "Guaranteed" + l3Allocation: "100%" + shared: + # Allocate 40% L3 cache IDs to the "shared" partition + # These will NOT overlap with the cache lines allocated for "exclusive" partition + l3Allocation: "40%" + mbAllocation: ["50%"] + classes: + Burstable: + # Allow "Burstable" to use all cache lines of the "shared" partition + l3Allocation: "100%" + BestEffort: + # Allow "Besteffort" to use only half of the L3 cache # lines of the "shared" partition. + # These will overlap with those used by "Burstable" + l3Allocation: "50%" + logger: |+ + Debug: resource-manager,cache diff --git a/sample-configs/topology-aware-policy.cfg b/sample-configs/topology-aware-policy.cfg new file mode 100644 index 000000000..65ae79427 --- /dev/null +++ b/sample-configs/topology-aware-policy.cfg @@ -0,0 +1,6 @@ +policy: + Active: topology-aware + ReservedResources: + CPU: 750m +logger: + Debug: nri-resmgr,resource-manager,cache diff --git a/test/e2e/run.sh b/test/e2e/run.sh index afab05085..159b7b0b5 100755 --- a/test/e2e/run.sh +++ b/test/e2e/run.sh @@ -77,6 +77,23 @@ if [ "$1" == "runtime-logs" ]; then exit fi +script_source="$(< "$0") $(< "$LIB_DIR/vm.bash")" + +help() { # script API + # Usage: help [FUNCTION|all] + # + # Print help on all functions or on the FUNCTION available in script. + awk -v f="$1" \ + '/^[a-z].*script API/{split($1,a,"(");if(f==""||f==a[1]||f=="all"){print "";print a[1]":";l=2}} + !/^ #/{l=l-1} + /^ #/{if(l>=1){split($0,a,"#"); print " "a[2]; if (f=="") l=0}}' <<<"$script_source" +} + +if [ "$1" == "help" ]; then + help + exit 0 +fi + echo echo " VM = $vm_name" echo " Distro = $distro"