diff --git a/CODE-OF-CONDUCT.md b/CODE-OF-CONDUCT.md
index ed820eeb3..c622543c8 100644
--- a/CODE-OF-CONDUCT.md
+++ b/CODE-OF-CONDUCT.md
@@ -1,3 +1,3 @@
-## The NRI Plugin Collection Project Community Code of Conduct
+# The NRI Plugins Project Community Code of Conduct
-The NRI Plugin Collection Project follows the [Containers Community Code of Conduct](https://github.com/containers/common/blob/main/CODE-OF-CONDUCT.md).
+The NRI Plugins Project follows the [Containers Community Code of Conduct](https://github.com/containers/common/blob/main/CODE-OF-CONDUCT.md).
diff --git a/Makefile b/Makefile
index ec1879ba5..9dee84432 100644
--- a/Makefile
+++ b/Makefile
@@ -91,6 +91,17 @@ LDFLAGS = \
-X=github.com/containers/nri-plugins/pkg/version.Build=$(BUILD_BUILDID) \
-B 0x$(RANDOM_ID)"
+# Documentation-related variables
+SPHINXOPTS ?= -W
+SPHINXBUILD = sphinx-build
+SITE_BUILDDIR ?= build/docs
+
+# Docker base command for working with html documentation.
+DOCKER_SITE_BUILDER_IMAGE := nri-plugins-site-builder
+DOCKER_SITE_CMD := $(DOCKER) run --rm -v "`pwd`:/docs" --user=`id -u`:`id -g` \
+ -p 8081:8081 \
+ -e SITE_BUILDDIR=$(SITE_BUILDDIR) -e SPHINXOPTS=$(SPHINXOPTS)
+
#
# top-level targets
#
@@ -329,3 +340,33 @@ report-licenses:
--ignore github.com/containers/nri-plugins \
> $(LICENSE_PATH)/licenses.csv && \
echo See $(LICENSE_PATH)/licenses.csv for license information
+
+#
+# Rules for documentation
+#
+
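+# Build the HTML documentation locally with Sphinx; the generated site, the
+# redirect index.html and any figure directories end up in $(SITE_BUILDDIR).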
+html: clean-html
+ $(Q)BUILD_VERSION=$(BUILD_VERSION) \
+ $(SPHINXBUILD) -c docs . "$(SITE_BUILDDIR)" $(SPHINXOPTS)
+ cp docs/index.html "$(SITE_BUILDDIR)"
+ for d in $$(find docs -name figures -type d); do \
+ mkdir -p $(SITE_BUILDDIR)/$$d && cp $$d/* $(SITE_BUILDDIR)/$$d; \
+ done
+
+serve-html: html
+ $(Q)cd $(SITE_BUILDDIR) && python3 -m http.server 8081
+
+clean-html:
+ rm -rf $(SITE_BUILDDIR)
+
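+# Containerized documentation builds: build the site-builder image once
+# (tracked with a stamp file) and run the html/serve-html rules inside it.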
+site-build: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp
+ $(Q)$(DOCKER_SITE_CMD) $(DOCKER_SITE_BUILDER_IMAGE) make html
+
+site-serve: .$(DOCKER_SITE_BUILDER_IMAGE).image.stamp
+ $(Q)$(DOCKER_SITE_CMD) -it $(DOCKER_SITE_BUILDER_IMAGE) make serve-html
+
+.$(DOCKER_SITE_BUILDER_IMAGE).image.stamp: docs/Dockerfile docs/requirements.txt
+ docker build -t $(DOCKER_SITE_BUILDER_IMAGE) docs
+ touch $@
+
+docs: site-build
diff --git a/README.md b/README.md
index 2a048c0a3..ef6c74454 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# NRI Plugin Collection
+# NRI Plugins
This repository contains a collection of community maintained NRI plugins.
@@ -9,4 +9,4 @@ Currently following plugins are available:
| [Topology Aware][1] | resource policy |
| [Balloons][1] | resource policy |
-[1]: http://github.com/containers/nri-plugins/blob/main/docs/README-resource-policy.md
+[1]: http://github.com/containers/nri-plugins/blob/main/docs/resource-policy/README.md
diff --git a/SECURITY.md b/SECURITY.md
index 6d7c62b19..8b709c43c 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,4 +1,4 @@
-# Security and Disclosure Information Policy for the NRI Plugin Collection Project
+# Security and Disclosure Information Policy for the NRI Plugins Project
* [Reporting a Vulnerability](#Reporting-a-Vulnerability)
* [Security Announcements](#Security-Announcements)
@@ -6,7 +6,7 @@
## Reporting a Vulnerability
-If you think you've identified a security issue in a NRI Plugin Collection project,
+If you think you've identified a security issue in an NRI Plugins project,
please DO NOT report the issue publicly via the Github issue tracker,
mailing list, or IRC. Instead, send an email with as many details as
possible to [cncf-crio-security@lists.cncf.io](mailto:cncf-crio-security@lists.cncf.io?subject=Security%20Vunerablity%20Report) or [security@containerd.io](mailto:security@containerd.io?subject=Security%20Vunerablity%20Report).
diff --git a/docs/Dockerfile b/docs/Dockerfile
new file mode 100644
index 000000000..bc702fdcd
--- /dev/null
+++ b/docs/Dockerfile
@@ -0,0 +1,13 @@
+FROM sphinxdoc/sphinx:5.3.0
+
+RUN apt-get update && apt-get install -y wget git
+
+# Note: Any golang version that can 'go list -m -f {{.Variable}}' is fine...
+RUN wget https://go.dev/dl/go1.20.4.linux-amd64.tar.gz && \
+ tar -C /usr/local -xzf go1.20.4.linux-amd64.tar.gz
+
+ENV PATH=$PATH:/usr/local/go/bin
+
+COPY requirements.txt .
+
+RUN pip3 install -r requirements.txt
diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html
new file mode 100644
index 000000000..0e68b46ac
--- /dev/null
+++ b/docs/_templates/layout.html
@@ -0,0 +1,35 @@
+{%- extends "!layout.html" %}
+
+{% block footer %}
+ {% if versions_menu %}
+
+
+ GitHub Pages
+ {{ versions_menu_this_version }}
+
+
+
+
+ {% endif %}
+
+
+{% endblock %}
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 000000000..3ba21d146
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,282 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+from docutils import nodes
+from os.path import isdir, isfile, join, basename, dirname
+from os import makedirs, getenv
+from shutil import copyfile
+from subprocess import run, STDOUT
+
+# -- Project information -----------------------------------------------------
+
+project = 'NRI Plugins'
+copyright = '2023, various'
+author = 'various'
+
+master_doc = 'docs/index'
+
+
+##############################################################################
+#
+# This section determines the behavior of links to local items in .md files.
+#
+# if useGitHubURL == True:
+#
+# links to local files and directories will be turned into github URLs
+# using either the baseBranch defined here or using the commit SHA.
+#
+# if useGitHubURL == False:
+#
+# local files will be moved to the website directory structure when built
+# local directories will still be links to github URLs
+#
+# if built with GitHub workflows:
+#
+# the GitHub URLs will use the commit SHA (GITHUB_SHA environment variable
+# is defined by GitHub workflows) to link to the specific commit.
+#
+##############################################################################
+
+baseBranch = "main"
+useGitHubURL = True
+commitSHA = getenv('GITHUB_SHA')
+githubServerURL = getenv('GITHUB_SERVER_URL')
+githubRepository = getenv('GITHUB_REPOSITORY')
+if githubServerURL and githubRepository:
+ githubBaseURL = join(githubServerURL, githubRepository)
+else:
+ githubBaseURL = "https://github.com/containers/nri-plugins/"
+
+githubFileURL = join(githubBaseURL, "blob/")
+githubDirURL = join(githubBaseURL, "tree/")
+if commitSHA:
+ githubFileURL = join(githubFileURL, commitSHA)
+ githubDirURL = join(githubDirURL, commitSHA)
+else:
+ githubFileURL = join(githubFileURL, baseBranch)
+ githubDirURL = join(githubDirURL, baseBranch)
+
+# Version displayed in the upper left corner of the site
+ref = getenv('GITHUB_REF', default="")
+if ref == "refs/heads/main":
+ version = "devel"
+elif ref.startswith("refs/heads/release-"):
+ # For release branches just show the latest tag name
+ buildVersion = getenv("BUILD_VERSION", default="unknown")
+ version = buildVersion.split('-')[0]
+elif ref.startswith("refs/tags/"):
+ version = ref[len("refs/tags/"):]
+else:
+ version = getenv("BUILD_VERSION", default="unknown")
+
+release = getenv("BUILD_VERSION", default="unknown")
+
+# Versions to show in the version menu
+if getenv('VERSIONS_MENU'):
+ html_context = {
+ 'versions_menu': True,
+ 'versions_menu_this_version': getenv('VERSIONS_MENU_THIS_VERSION', version)}
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = ['myst_parser', 'sphinx_markdown_tables']
+myst_enable_extensions = ['substitution']
+source_suffix = {'.rst': 'restructuredtext','.md': 'markdown'}
+
+# Substitution variables
+def module_version(module, version):
+ version=version.split('-', 1)[0]
+ if module == 'github.com/intel/goresctrl':
+ version = '.'.join(version.split('.')[0:2]) + '.0'
+ return version
+
+def gomod_versions(modules):
+ versions = {}
+ gocmd = run(['go', 'list', '-m', '-f', '{{.GoVersion}}'],
+ check=True, capture_output=True, universal_newlines=True)
+ versions['golang'] = gocmd.stdout.strip()
+ for m in modules:
+ gocmd = run(['go', 'list', '-m', '-f', '{{.Version}}', '%s' % m],
+ check=True, capture_output=True, universal_newlines=True)
+ versions[m] = module_version(m, gocmd.stdout.strip())
+ return versions
+
+mod_versions = gomod_versions(['github.com/intel/goresctrl'])
+myst_substitutions = {
+ 'golang_version': mod_versions['golang'],
+ 'goresctrl_version': mod_versions['github.com/intel/goresctrl']
+}
+myst_heading_anchors = 3
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = [
+    '_build',
+    '.github',
+    '_work',
+    'generate',
+    'README.md',
+    'TODO.md',
+    'SECURITY.md',
+    'CODE-OF-CONDUCT.md',
+    'docs/releases',
+    'test/self-hosted-runner/README.md',
+    'test/e2e/README.md',
+    'docs/resource-policy/releases',
+    'docs/resource-policy/README.md',
+    'test/statistics-analysis/README.md',
+]
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+html_theme_options = {
+ 'display_version': True,
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = ['_static']
+
+def setup(app):
+ app.connect('doctree-resolved',fixLocalMDAnchors)
+ app.connect('missing-reference',fixRSTLinkInMD)
+
+###############################################################################
+#
+# This section defines callbacks that make markdown specific tweaks to
+# either:
+#
+# 1. Fix something that recommonmark does wrong.
+# 2. Provide support for .md files that are written as READMEs in a GitHub
+# repo.
+#
+# Only use these changes if using the extension ``recommonmark``.
+#
+###############################################################################
+
+def isHTTPLink(uri):
+ return uri.startswith('http://') or uri.startswith('https://')
+
+def isMDFileLink(uri):
+ return uri.endswith('.md') or '.md#' in uri
+
+def isRSTFileLink(uri):
+ return uri.endswith('.rst')
+
+# Callback registered with 'missing-reference'.
+def fixRSTLinkInMD(app, env, node, contnode):
+ refTarget = node.get('reftarget')
+
+ if isHTTPLink(refTarget):
+ return
+
+ if isRSTFileLink(refTarget) and not isHTTPLink(refTarget):
+ # This occurs when a .rst file is referenced from a .md file
+ # Currently unable to check if file exists as no file
+ # context is provided and links are relative.
+ #
+ # Example: [Application examples](examples/readme.rst)
+ #
+ contnode['refuri'] = contnode['refuri'].replace('.rst','.html')
+ contnode['internal'] = "True"
+ return contnode
+ elif refTarget.startswith("/"):
+ # This occurs when a file is referenced for download from an .md file.
+ # Construct a list of them and short-circuit the warning. The files
+        # are moved later (this needs file location context). To avoid
+        # warnings when writing .md files, make the links absolute. This only
+        # marks them as fixed if it can verify that they exist.
+ #
+ # Example: [Makefile](/Makefile)
+ #
+ filePath = refTarget.lstrip("/")
+ if isfile(filePath) or isdir(filePath):
+ return contnode
+
+
+def normalizePath(docPath,uriPath):
+ if uriPath == "":
+ return uriPath
+ if "#" in uriPath:
+ # Strip out anchors
+ uriPath = uriPath.split("#")[0]
+ if uriPath.startswith("/"):
+ # It's an absolute path
+ return uriPath.lstrip("/") #path to file from project directory
+ else:
+ # It's a relative path
+ docDir = dirname(docPath)
+ return join(docDir,uriPath) #path to file from referencing file
+
+
+# Callback registered with 'doctree-resolved'.
+def fixLocalMDAnchors(app, doctree, docname):
+ for node in doctree.traverse(nodes.reference):
+ uri = node.get('refuri')
+
+ if isHTTPLink(uri):
+ continue
+
+ filePath = normalizePath(docname,uri)
+
+ if isfile(filePath):
+ # Only do this if the file exists.
+ #
+ # TODO: Pop a warning if the file doesn't exist.
+ #
+ if isMDFileLink(uri) and not isHTTPLink(uri):
+ # Make sure .md file links that weren't caught are converted.
+ # These occur when creating an explicit link to an .md file
+ # from an .rst file. By default these are not validated by Sphinx
+ # or recommonmark. Only toctree references are validated. recommonmark
+ # also fails to convert links to local Markdown files that include
+ # anchors. This fixes that as well.
+ #
+ # Only include this code if .md files are being converted to html
+ #
+ # Example: `Google Cloud Engine `__
+ # [configuration options](autotest.md#configuration-options)
+ #
+ node['refuri'] = node['refuri'].replace('.md','.html')
+ else:
+ # Handle the case where markdown is referencing local files in the repo
+ #
+ # Example: [Makefile](/Makefile)
+ #
+ if useGitHubURL:
+ # Replace references to local files with links to the GitHub repo
+ #
+ newURI = join(githubFileURL, filePath)
+ print("new url: ", newURI)
+ node['refuri']=newURI
+ else:
+ # If there are links to local files other than .md (.rst files are caught
+ # when warnings are fired), move the files into the Sphinx project, so
+ # they can be accessed.
+ newFileDir = join(app.outdir,dirname(filePath)) # where to move the file in Sphinx output.
+ newFilePath = join(app.outdir,filePath)
+ newURI = uri # if the path is relative no need to change it.
+ if uri.startswith("/"):
+ # It's an absolute path. Need to make it relative.
+ uri = uri.lstrip("/")
+ docDirDepth = len(docname.split("/")) - 1
+ newURI = "../"*docDirDepth + uri
+ if not isdir(newFileDir):
+ makedirs(newFileDir)
+ copyfile(filePath,newFilePath)
+ node['refuri'] = newURI
+ elif "#" not in uri: # ignore anchors
+ # turn links to directories into links to the repo
+ if isdir(filePath):
+ newURI = join(githubDirURL, filePath)
+ node['refuri']=newURI
diff --git a/docs/contributing.md b/docs/contributing.md
new file mode 100644
index 000000000..708eb515f
--- /dev/null
+++ b/docs/contributing.md
@@ -0,0 +1,10 @@
+# Contributing
+
+Please use the GitHub\* infrastructure for contributing to NRI Plugins.
+Use [pull requests](https://github.com/containers/nri-plugins/pulls)
+to contribute code, bug fixes, or if you want to discuss your ideas in terms of
+code. Open [issues](https://github.com/containers/nri-plugins/issues) to
+report bugs, request new features, or if you want to discuss any other topics
+related to NRI plugins.
+
+For the actual NRI (Node Resource Interface) API, please see the [NRI repository](https://github.com/containerd/nri).
diff --git a/docs/index.html b/docs/index.html
new file mode 100644
index 000000000..50d6988fa
--- /dev/null
+++ b/docs/index.html
@@ -0,0 +1 @@
+
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 000000000..4fcefd194
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,14 @@
+.. NRI Plugins documentation master file
+
+Welcome to NRI Plugins documentation
+====================================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ resource-policy/index.rst
+
+ contributing.md
+
+   Project GitHub repository <https://github.com/containers/nri-plugins>
diff --git a/docs/releases/conf.py b/docs/releases/conf.py
new file mode 100644
index 000000000..ee64ee7ad
--- /dev/null
+++ b/docs/releases/conf.py
@@ -0,0 +1,76 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'NRI Plugins'
+copyright = '2023, various'
+author = 'various'
+
+# Versions to show in the version menu
+version = "all releases"
+if os.getenv('VERSIONS_MENU'):
+ html_context = {
+ 'versions_menu': True,
+ 'versions_menu_this_version': version}
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'myst_parser',
+ 'sphinx_markdown_tables'
+ ]
+source_suffix = {
+ '.rst': 'restructuredtext',
+ '.md': 'markdown'
+ }
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['../_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+html_theme_options = {
+ 'display_version': True,
+}
+
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = ['_static']
+
+# Callbacks for recommonmark
+def setup(app):
+ app.connect('missing-reference',ignoreMissingRefs)
+
+def ignoreMissingRefs(app, env, node, contnode):
+ return contnode
diff --git a/docs/releases/index.md b/docs/releases/index.md
new file mode 100644
index 000000000..edf183c77
--- /dev/null
+++ b/docs/releases/index.md
@@ -0,0 +1,21 @@
+# Releases
+
+For up-to-date user documentation see the [documentation site](/nri-plugins/resource-policy)
+
+## Documentation for Released Versions
+
+
+
+
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 000000000..329afe4b4
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,5 @@
+sphinx==5.3.0
+sphinx_rtd_theme
+myst-parser==0.18.1
+sphinx-markdown-tables
+Pygments==2.13.0
diff --git a/docs/README-resource-policy.md b/docs/resource-policy/README.md
similarity index 100%
rename from docs/README-resource-policy.md
rename to docs/resource-policy/README.md
diff --git a/docs/resource-policy/developers-guide/architecture.md b/docs/resource-policy/developers-guide/architecture.md
new file mode 100644
index 000000000..130df98e5
--- /dev/null
+++ b/docs/resource-policy/developers-guide/architecture.md
@@ -0,0 +1,205 @@
+# Architecture
+
+## Overview
+
+The NRI Resource Policy plugin (NRI-RP below) is an add-on for controlling
+container resource allocation on Kubernetes nodes.
+
+NRI-RP plugs into the NRI interface provided by the container runtime
+implementation. NRI-RP may alter the resource allocation of containers,
+depending on its configuration.
+
+NRI-RP keeps track of the states of all containers running on a Kubernetes
+node. Whenever it receives an NRI request that results in changes to the
+resource allocation of any container (container creation, deletion, or
+resource assignment update request), NRI-RP runs the built-in policy
+algorithm. This policy makes a decision about how the assignment of
+resources should be updated. The policy can make changes to any
+container in the system, not just the one associated with the received
+NRI request. NRI-RP's internal state tracking cache provides an abstraction
+for modifying containers and the policy uses this abstraction for recording its
+decisions.
+
+Many aspects of NRI-RP are configurable. These include, for instance,
+configuration of the resource assignment algorithm for the policy.
+
+Although NRI-RP can be configured using a static configuration file,
+the preferred way to configure all NRI-RP instances in a cluster is to use
+Kubernetes CRDs and ConfigMaps.
+
+
+
+
+
+
+## Components
+
+### [Node Agent](/pkg/resmgr/agent/)
+
+The node agent is a component internal to NRI-RP itself. All interactions
+by NRI-RP with the Kubernetes Control Plane go through the node agent with
+the node agent performing any direct interactions on behalf of NRI-RP.
+
+The agent interface implements the following functionality:
+  - pushing updated external configuration data to NRI-RP
+ - updating resource capacity of the node
+ - getting, setting, or removing labels on the node
+ - getting, setting, or removing annotations on the node
+ - getting, setting, or removing taints on the node
+
+The config interface is defined and has its gRPC server running in
+NRI-RP. The agent acts as a gRPC client for this interface. The low-level
+cluster interface is defined and has its gRPC server running in the agent,
+with the [convenience layer](/pkg/resmgr/agent) defined in NRI-RP.
+NRI-RP acts as a gRPC client for the low-level plumbing interface.
+
+Additionally, the stock node agent that comes with NRI-RP implements schemes
+for:
+ - configuration management for all NRI-RP instances
+ - management of dynamic adjustments to container resource assignments
+
+
+### [Resource Manager](/pkg/resmgr/)
+
+NRI-RP implements an event processing pipeline. In addition to NRI events,
+it processes a set of other events that are not directly related to or the
+result of NRI requests.
+These events are typically internally generated within NRI-RP. They can be
+the result of changes in the state of some containers or the utilization
+of a shared system resource, which potentially could warrant an attempt to
+rebalance the distribution of resources among containers to bring the system
+closer to an optimal state. Some events can also be generated by policies.
+
+The Resource Manager component of NRI-RP implements the basic control
+flow of the processing pipeline. It passes control to all the
+necessary sub-components of NRI-RP at the various phases of processing a
+request or an event. Additionally, it serializes the processing of these,
+making sure there is at most one request or event being processed at any
+point in time.
+
+The high-level control flow of the request processing pipeline is as
+follows:
+
+A. If the request does not need policying, let it bypass the processing
+pipeline; hand it off for logging, then relay it to the server and the
+corresponding response back to the client.
+
+B. If the request needs to be intercepted for policying, do the following:
+ 1. Lock the processing pipeline serialization lock.
+ 2. Look up/create cache objects (pod/container) for the request.
+ 3. If the request has no resource allocation consequences, do proxying
+ (step 6).
+ 4. Otherwise, invoke the policy layer for resource allocation:
+ - Pass it on to the configured active policy, which will
+ - Allocate resources for the container.
+ - Update the assignments for the container in the cache.
+ - Update any other containers affected by the allocation in the cache.
+ 5. Invoke the controller layer for post-policy processing, which will:
+     - Collect controllers with pending changes in their domain of control.
+     - For each such controller, invoke the post-policy processing function
+       corresponding to the request.
+ - Clear pending markers for the controllers.
+ 6. Proxy the request:
+ - Relay the request to the server.
+ - Send update requests for any additional affected containers.
+ - Update the cache if/as necessary based on the response.
+ - Relay the response back to the client.
+ 7. Release the processing pipeline serialization lock.
+
+The high-level control flow of the event processing pipeline is one of the
+following, based on the event type:
+
+ - For policy-specific events:
+ 1. Engage the processing pipeline lock.
+ 2. Call policy event handler.
+ 3. Invoke the controller layer for post-policy processing (same as step 5 for requests).
+ 4. Release the pipeline lock.
+ - For metrics events:
+ 1. Perform collection/processing/correlation.
+ 2. Engage the processing pipeline lock.
+ 3. Update cache objects as/if necessary.
+ 4. Request rebalancing as/if necessary.
+ 5. Release pipeline lock.
+ - For rebalance events:
+ 1. Engage the processing pipeline lock.
+ 2. Invoke policy layer for rebalancing.
+ 3. Invoke the controller layer for post-policy processing (same as step 5 for requests).
+ 4. Release the pipeline lock.
+
+
+### [Cache](/pkg/resmgr/cache/)
+
+The cache is a shared internal storage location within NRI-RP. It tracks the
+runtime state of pods and containers known to NRI-RP, as well as the state
+of NRI-RP itself, including the active configuration and the state of the
+active policy. The cache is saved to permanent storage in the filesystem and
+is used to restore the runtime state of NRI-RP across restarts.
+
+The cache provides functions for querying and updating the state of pods and
+containers. This is the mechanism used by the active policy to make resource
+assignment decisions. The policy simply updates the state of the affected
+containers in the cache according to the decisions.
+
+The cache's ability to associate and track changes to containers with
+resource domains is used to enforce policy decisions. The generic controller
+layer first queries which containers have pending changes, then invokes each
+controller for each container. The controllers use the querying functions
+provided by the cache to decide if anything in their resource/control domain
+needs to be changed and then act accordingly.
+
+Access to the cache needs to be serialized. However, this serialization is
+not provided by the cache itself. Instead, it assumes callers to make sure
+proper protection is in place against concurrent read-write access. The
+request and event processing pipelines in the resource manager use a lock to
+serialize request and event processing and consequently access to the cache.
+
+If a policy needs to do processing unsolicited by the resource manager, in other
+words processing other than handling the internal policy backend API calls from
+the resource manager, then it should inject a policy event into the resource
+manager's event loop. This causes a callback from the resource manager to
+the policy's event handler with the injected event as an argument and with
+the cache properly locked.
+
+
+### [Generic Policy Layer](/pkg/resmgr/policy/policy.go)
+
+The generic policy layer defines the abstract interface the rest of NRI-RP
+uses to interact with policy implementations and takes care of the details
+of activating and dispatching calls through to the configured active policy.
+
+
+### [Generic Resource Controller Layer](/pkg/resmgr/control/control.go)
+
+The generic resource controller layer defines the abstract interface the rest
+of NRI-RP uses to interact with resource controller implementations and takes
+care of the details of dispatching calls to the controller implementations
+for post-policy enforcement of decisions.
+
+
+### [Metrics Collector](/pkg/metrics/)
+
+The metrics collector gathers a set of runtime metrics about the containers
+running on the node. NRI-RP can be configured to periodically evaluate this
+collected data to determine how optimal the current assignment of container
+resources is and to attempt a rebalancing/reallocation if it is deemed
+both possible and necessary.
+
+
+### [Policy Implementations](/cmd/)
+
+#### [Topology Aware](/cmd/topology-aware/)
+
+A topology-aware policy capable of handling multiple tiers/types of memory,
+typically a DRAM/PMEM combination configured in 2-layer memory mode.
+
+#### [Balloons](/cmd/balloons/)
+
+The balloons policy allows the user to define fine-grained control over how
+compute resources are distributed to workloads.
+
+#### [Template](/cmd/template/)
+
+The template policy can be used as a base for developing new policies.
+It provides hooks that the policy developer can fill in to define fine-grained
+control over how compute resources are distributed to workloads.
+Do not edit the template policy directly; copy it under a new name and edit the copy.
diff --git a/docs/resource-policy/developers-guide/e2e-test.md b/docs/resource-policy/developers-guide/e2e-test.md
new file mode 100644
index 000000000..06adb92f5
--- /dev/null
+++ b/docs/resource-policy/developers-guide/e2e-test.md
@@ -0,0 +1,118 @@
+# End-to-End tests
+
+## Prerequisites
+
+Install:
+- `docker`
+- `vagrant`
+
+## Usage
+
+Run policy tests:
+
+```
+cd test/e2e
+[VAR=VALUE...] ./run_tests.sh policies.test-suite
+```
+
+Run tests only on certain policy, topology, or only selected test:
+
+```
+cd test/e2e
+[VAR=VALUE...] ./run_tests.sh policies.test-suite[/POLICY[/TOPOLOGY[/testNN-*]]]
+```
+
+Get help on available `VAR=VALUE`'s with `./run.sh help`.
+`run_tests.sh` calls `run.sh` in order to execute selected tests.
+Therefore the same `VAR=VALUE` definitions apply to both scripts.
+
+## Test phases
+
+In the *setup phase* `run.sh` creates a virtual machine unless it
+already exists. When it is running, tests create a single-node cluster
+and deploy the `nri-resource-policy` DaemonSet on it.
+
+In the *test phase* `run.sh` runs a test script. *Test scripts* are
+`bash` scripts that can use helper functions for running commands and
+observing the status of the virtual machine and software running on it.
+
+In the *tear down phase* `run.sh` copies logs from the virtual machine
+and finally stops or deletes the virtual machine, if that is wanted.
+
+## Test modes
+
+- `test` mode runs fast and reports `Test verdict: PASS` or
+ `FAIL`. The exit status is zero if and only if a test passed.
+
+Currently only the normal test mode is supported.
+
+## Running from scratch and quick rerun in existing virtual machine
+
+The test will use a `vagrant`-managed virtual machine named in the
+`vm_name` environment variable. The default name is constructed
+from the topology, Linux distribution, and runtime name used.
+If a virtual machine already exists, the test will be run on it.
+Otherwise the test will create a virtual machine from scratch.
+You can delete a virtual machine by going to the VM directory and
+giving the command `make destroy`.
+
+## Custom topologies
+
+If you change the NUMA node topology of an existing virtual machine, you
+must delete the virtual machine first. Otherwise the `topology` variable
+is ignored and the test will run in the existing NUMA
+configuration.
+
+The `topology` variable is a JSON array of objects. Each object
+defines one or more NUMA nodes. Keys in objects:
+```
+"mem" mem (RAM) size on each NUMA node in this group.
+ The default is "0G".
+"nvmem" nvmem (non-volatile RAM) size on each NUMA node
+ in this group. The default is "0G".
+"cores" number of CPU cores on each NUMA node in this group.
+ The default is 0.
+"threads" number of threads on each CPU core.
+ The default is 2.
+"nodes" number of NUMA nodes on each die.
+ The default is 1.
+"dies" number of dies on each package.
+ The default is 1.
+"packages" number of packages.
+ The default is 1.
+```
+
+
+Example:
+
+Run the test in a VM with two NUMA nodes. There are 4 CPUs (two cores, two
+threads per core by default) and 4G RAM in each node:
+```
+e2e$ vm_name=my2x4 topology='[{"mem":"4G","cores":2,"nodes":2}]' ./run.sh
+```
+
+Run the test in a VM with 32 CPUs in total: there are two packages
+(sockets) in the system, each containing two dies, each die containing
+two NUMA nodes, each node containing 2 CPU cores, each core containing
+two threads. In addition, there is one NUMA node with 16G of non-volatile
+memory (NVRAM) but no CPUs.
+
+```
+e2e$ vm_name=mynvram topology='[{"mem":"4G","cores":2,"nodes":2,"dies":2,"packages":2},{"nvmem":"16G"}]' ./run.sh
+```
+
+## Test output
+
+All test output is saved under the directory in the environment
+variable `outdir` if the `run.sh` script is executed as is. The default
+output directory in this case is `./output`.
+
+For the standard e2e-tests run by `run_tests.sh`, the output directory
+is constructed from the Linux distribution, container runtime name, and
+machine topology used.
+For example, the `n4c16-generic-fedora37-containerd` output directory
+indicates a four-node, 16-CPU system running Fedora 37 with containerd
+as the container runtime.
+
+Executed commands with their output, exit status and timestamps are
+saved under the `output/commands` directory.
diff --git a/docs/resource-policy/developers-guide/figures/nri-resource-policy.png b/docs/resource-policy/developers-guide/figures/nri-resource-policy.png
new file mode 100644
index 000000000..fdb385c4e
Binary files /dev/null and b/docs/resource-policy/developers-guide/figures/nri-resource-policy.png differ
diff --git a/docs/resource-policy/developers-guide/index.rst b/docs/resource-policy/developers-guide/index.rst
new file mode 100644
index 000000000..dc83815f6
--- /dev/null
+++ b/docs/resource-policy/developers-guide/index.rst
@@ -0,0 +1,7 @@
+Developer's Guide
+#################
+.. toctree::
+ :maxdepth: 1
+
+ architecture.md
+ testing.rst
diff --git a/docs/resource-policy/developers-guide/testing.rst b/docs/resource-policy/developers-guide/testing.rst
new file mode 100644
index 000000000..3d0a9e30c
--- /dev/null
+++ b/docs/resource-policy/developers-guide/testing.rst
@@ -0,0 +1,8 @@
+Testing
+#######
+
+.. toctree::
+ :maxdepth: 1
+
+ unit-test.md
+ e2e-test.md
diff --git a/docs/resource-policy/developers-guide/unit-test.md b/docs/resource-policy/developers-guide/unit-test.md
new file mode 100644
index 000000000..752a11f23
--- /dev/null
+++ b/docs/resource-policy/developers-guide/unit-test.md
@@ -0,0 +1,7 @@
+# Unit tests
+
+Run unit tests with
+```
+make test
+```
+
diff --git a/docs/resource-policy/index.rst b/docs/resource-policy/index.rst
new file mode 100644
index 000000000..2fcb51f6c
--- /dev/null
+++ b/docs/resource-policy/index.rst
@@ -0,0 +1,16 @@
+.. NRI Resource Policy documentation master file
+
+Resource Policy Plugin
+======================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ introduction.md
+ installation.md
+ setup.md
+ policy/index.rst
+ node-agent.md
+
+ developers-guide/index.rst
diff --git a/docs/resource-policy/installation.md b/docs/resource-policy/installation.md
new file mode 100644
index 000000000..25267fe2b
--- /dev/null
+++ b/docs/resource-policy/installation.md
@@ -0,0 +1 @@
+# Installation
diff --git a/docs/resource-policy/introduction.md b/docs/resource-policy/introduction.md
new file mode 100644
index 000000000..9c5e47260
--- /dev/null
+++ b/docs/resource-policy/introduction.md
@@ -0,0 +1,12 @@
+# Introduction
+
+NRI Resource Policy is an NRI container runtime plugin. It is connected
+to the container runtime implementation (containerd, CRI-O) via the NRI API.
+The main purpose of the NRI resource plugin is to apply hardware-aware
+resource allocation policies to the containers running in the system.
+
+There are different policies available, each with a different set of
+goals in mind and implementing different hardware allocation strategies. The
+details of whether and how a container resource request is altered or
+if extra actions are performed depend on which policy plugin is running
+and how that policy is configured.
diff --git a/docs/resource-policy/node-agent.md b/docs/resource-policy/node-agent.md
new file mode 100644
index 000000000..ea7a57840
--- /dev/null
+++ b/docs/resource-policy/node-agent.md
@@ -0,0 +1,27 @@
+# Dynamic Configuration
+
+The NRI Resource Policy plugin can be configured dynamically using ConfigMaps.
+
+The plugin daemon monitors two ConfigMaps for the node, a primary node-specific one
+and a secondary group-specific or default one, depending on whether the node
+belongs to a configuration group. The node-specific ConfigMap always takes
+precedence over the others.
+
+The names of these ConfigMaps are
+
+1. `nri-resource-policy-config.node.$NODE_NAME`: primary, node-specific configuration
+2. `nri-resource-policy-config.group.$GROUP_NAME`: secondary group-specific node
+ configuration
+3. `nri-resource-policy-config.default`: secondary default node
+ configuration
+
+You can assign a node to a configuration group by setting the
+`resource-policy.nri.io/group` label on the node to the name of
+the configuration group. You can remove a node from its group by deleting
+the node group label.
+
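+For example, the following node metadata snippet would assign a node to a
+hypothetical configuration group called `production`, making
+`nri-resource-policy-config.group.production` its secondary ConfigMap (the
+group name here is only illustrative):
+
+```yaml
+metadata:
+  labels:
+    resource-policy.nri.io/group: production
+```
+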
+There is a
+[sample ConfigMap spec](/sample-configs/nri-resource-policy-configmap.example.yaml)
+that contains a node-specific, a group-specific, and a default ConfigMap
+example. See [any available policy-specific documentation](policy/index.rst)
+for more information on the policy configurations.
diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md
new file mode 100644
index 000000000..f6a9c80da
--- /dev/null
+++ b/docs/resource-policy/policy/balloons.md
@@ -0,0 +1,223 @@
+# Balloons Policy
+
+## Overview
+
+The balloons policy implements workload placement into "balloons" that
+are disjoint CPU pools. Balloons can be inflated and deflated, that is
+CPUs added and removed, based on the CPU resource requests of
+containers. Balloons can be static or dynamically created and
+destroyed. CPUs in balloons can be configured, for example, by setting
+min and max frequencies on CPU cores and uncore.
+
+## How It Works
+
+1. User configures balloon types from which the policy instantiates
+ balloons.
+
+2. A balloon has a set of CPUs and a set of containers that run on the
+ CPUs.
+
+3. Every container is assigned to exactly one balloon. A container is
+ allowed to use all CPUs of its balloon and no other CPUs.
+
+4. Every logical CPU belongs to at most one balloon. There can be CPUs
+ that do not belong to any balloon.
+
+5. The number of CPUs in a balloon can change during the lifetime of
+ the balloon. If a balloon inflates, that is CPUs are added to it,
+ all containers in the balloon are allowed to use more CPUs. If a
+ balloon deflates, the opposite is true.
+
+6. When a new container is created on a Kubernetes node, the policy
+ first decides the type of the balloon that will run the
+ container. The decision is based on annotations of the pod, or the
+ namespace if annotations are not given.
+
+7. Next the policy decides which balloon of the decided type will run
+ the container. Options are:
+ - an existing balloon that already has enough CPUs to run its
+ current and new containers
+ - an existing balloon that can be inflated to fit its current and
+ new containers
+ - new balloon.
+
+8. When a CPU is added to a balloon or removed from it, the CPU is
+   reconfigured based on the balloon's CPU class attributes, or idle CPU
+ class attributes.
+
+## Deployment
+
+Deploy nri-resource-policy-balloons on each node as you would for any
+other policy. See [installation](../installation.md) for more details.
+
+## Configuration
+
+The balloons policy is configured using the yaml-based configuration
+system of nri-resource-policy.
+See [setup and usage](../setup.md#setting-up-nri-resource-policy) for
+more details on managing the configuration.
+
+### Parameters
+
+Balloons policy parameters:
+
+- `PinCPU` controls pinning a container to CPUs of its balloon. The
+ default is `true`: the container cannot use other CPUs.
+- `PinMemory` controls pinning a container to the memories that are
+ closest to the CPUs of its balloon. The default is `true`: allow
+ using memory only from the closest NUMA nodes. Warning: this may
+  cause the kernel to kill workloads due to an out-of-memory error when the
+  closest NUMA nodes do not have enough memory. In this situation,
+  consider switching this option to `false`.
+- `IdleCPUClass` specifies the CPU class of those CPUs that do not
+ belong to any balloon.
+- `ReservedPoolNamespaces` is a list of namespaces (wildcards allowed)
+ that are assigned to the special reserved balloon, that is, will run
+ on reserved CPUs. This always includes the `kube-system` namespace.
+- `AllocatorTopologyBalancing` affects selecting CPUs for new
+ balloons. If `true`, new balloons are created using CPUs on
+  the NUMA/die/package with the most free CPUs, that is, balloons are spread
+  across the hardware topology. This helps inflate balloons within
+  the same NUMA/die/package and reduces interference between workloads
+  in balloons when the system is not fully loaded. The default is `false`:
+  pack new balloons tightly into the same NUMAs/dies/packages. This
+  helps keep large portions of the hardware idle so they can enter deep
+  power-saving states.
+- `BalloonTypes` is a list of balloon type definitions. Each type can
+ be configured with the following parameters:
+ - `Name` of the balloon type. This is used in pod annotations to
+ assign containers to balloons of this type.
+ - `Namespaces` is a list of namespaces (wildcards allowed) whose
+ pods should be assigned to this balloon type, unless overridden by
+ pod annotations.
+ - `MinBalloons` is the minimum number of balloons of this type that
+ is always present, even if the balloons would not have any
+ containers. The default is 0: if a balloon has no containers, it
+ can be destroyed.
+ - `MaxBalloons` is the maximum number of balloons of this type that
+ is allowed to co-exist. The default is 0: creating new balloons is
+ not limited by the number of existing balloons.
+ - `MaxCPUs` specifies the maximum number of CPUs in any balloon of
+ this type. Balloons will not be inflated larger than this. 0 means
+ unlimited.
+ - `MinCPUs` specifies the minimum number of CPUs in any balloon of
+ this type. When a balloon is created or deflated, it will always
+ have at least this many CPUs, even if containers in the balloon
+ request less.
+ - `CpuClass` specifies the name of the CPU class according to which
+ CPUs of balloons are configured.
+ - `PreferSpreadingPods`: if `true`, containers of the same pod
+ should be spread to different balloons of this type. The default
+ is `false`: prefer placing containers of the same pod to the same
+ balloon(s).
+ - `PreferPerNamespaceBalloon`: if `true`, containers in the same
+ namespace will be placed in the same balloon(s). On the other
+  hand, containers in different namespaces are preferably placed in
+ different balloons. The default is `false`: namespace has no
+ effect on choosing the balloon of this type.
+ - `PreferNewBalloons`: if `true`, prefer creating new balloons over
+ placing containers to existing balloons. This results in
+ preferring exclusive CPUs, as long as there are enough free
+ CPUs. The default is `false`: prefer filling and inflating
+ existing balloons over creating new ones.
+  - `ShareIdleCPUsInSame`: Whenever the number or sizes of balloons
+ change, idle CPUs (that do not belong to any balloon) are reshared
+ as extra CPUs to workloads in balloons with this option. The value
+ sets locality of allowed extra CPUs that will be common to these
+ workloads.
+ - `system`: workloads are allowed to use idle CPUs available
+ anywhere in the system.
+ - `package`: ...allowed to use idle CPUs in the same package(s)
+ (sockets) as the balloon.
+ - `die`: ...in the same die(s) as the balloon.
+ - `numa`: ...in the same numa node(s) as the balloon.
+ - `core`: ...allowed to use idle CPU threads in the same cores with
+ the balloon.
+ - `AllocatorPriority` (0: High, 1: Normal, 2: Low, 3: None). CPU
+ allocator parameter, used when creating new or resizing existing
+ balloons. If there are balloon types with pre-created balloons
+ (`MinBalloons` > 0), balloons of the type with the highest
+ `AllocatorPriority` are created first.
+
+Related configuration parameters:
+- `policy.ReservedResources.CPU` specifies the (number of) CPUs in the
+ special `reserved` balloon. By default all containers in the
+ `kube-system` namespace are assigned to the reserved balloon.
+- `cpu.classes` defines CPU classes and their parameters (such as
+ `minFreq`, `maxFreq`, `uncoreMinFreq` and `uncoreMaxFreq`).
+
+### Example
+
+Example configuration that runs all pods in balloons of 1-4 CPUs.
+```yaml
+policy:
+ Active: balloons
+ ReservedResources:
+ CPU: 1
+ balloons:
+ PinCPU: true
+ PinMemory: true
+ IdleCPUClass: lowpower
+ BalloonTypes:
+ - Name: "quad"
+ MinCpus: 1
+ MaxCPUs: 4
+ CPUClass: dynamic
+ Namespaces:
+ - "*"
+cpu:
+ classes:
+ lowpower:
+ minFreq: 800
+ maxFreq: 800
+ dynamic:
+ minFreq: 800
+ maxFreq: 3600
+ turbo:
+ minFreq: 3000
+ maxFreq: 3600
+ uncoreMinFreq: 2000
+ uncoreMaxFreq: 2400
+```
+
+See the [sample configmap](/sample-configs/balloons-policy.cfg) for a
+complete example.
+
+## Assigning a Container to a Balloon
+
+The balloon type of a container can be defined in pod annotations. In
+the example below, the first annotation sets the balloon type (`BT`)
+of a single container (`CONTAINER_NAME`). The last two annotations set
+the default balloon type for all containers in the pod.
+
+```yaml
+balloon.balloons.resource-policy.nri.io/container.CONTAINER_NAME: BT
+balloon.balloons.resource-policy.nri.io/pod: BT
+balloon.balloons.resource-policy.nri.io: BT
+```
+
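+As an illustration, a minimal pod spec using the pod-level annotation to
+request the `quad` balloon type from the example configuration above could
+look like the following sketch; the pod name, image, and CPU request are made
+up:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: demo-pod
+  annotations:
+    balloon.balloons.resource-policy.nri.io/pod: quad
+spec:
+  containers:
+    - name: demo
+      image: busybox
+      resources:
+        requests:
+          cpu: 2
+```
+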
+If a pod has no annotations, its namespace is matched to the
+`Namespaces` of balloon types. The first matching balloon type is
+used.
+
+If the namespace does not match, the container is assigned to the
+special `default` balloon, which means running on reserved CPUs unless `MinCPUs`
+or `MaxCPUs` of the `default` balloon type are explicitly defined in
+the `BalloonTypes` configuration.
+
+## Metrics and Debugging
+
+In order to enable more verbose logging and metrics exporting from the
+balloons policy, enable instrumentation and policy debugging from the
+nri-resource-policy global config:
+
+```yaml
+instrumentation:
+ # The balloons policy exports containers running in each balloon,
+ # and cpusets of balloons. Accessible in command line:
+ # curl --silent http://localhost:8891/metrics
+ HTTPEndpoint: :8891
+ PrometheusExport: true
+logger:
+ Debug: policy
+```
diff --git a/docs/resource-policy/policy/container-affinity.md b/docs/resource-policy/policy/container-affinity.md
new file mode 100644
index 000000000..d4a1ab2e2
--- /dev/null
+++ b/docs/resource-policy/policy/container-affinity.md
@@ -0,0 +1,267 @@
+# Container Affinity and Anti-Affinity
+
+## Introduction
+
+The topology-aware resource policy allows the user to give hints about how
+particular containers should be *co-located* within a node. In particular these
+hints express whether containers should be located *'close'* to each other or
+*'far away'* from each other, in a hardware topology sense.
+
+Since these hints are always interpreted by a particular *policy implementation*,
+the exact definitions of 'close' and 'far' are also somewhat *policy-specific*.
+However as a general rule of thumb containers running
+
+ - on CPUs within the *same NUMA nodes* are considered *'close'* to each other,
+ - on CPUs within *different NUMA nodes* in the *same socket* are *'farther'*, and
+ - on CPUs within *different sockets* are *'far'* from each other.
+
+These hints are expressed by `container affinity annotations` on the Pod.
+There are two types of affinities:
+
+ - `affinity` (or `positive affinity`): causes affected containers to *pull* each other closer
+ - `anti-affinity` (or `negative affinity`): causes affected containers to *push* each other further away
+
+Policies try to place a container
+ - close to those the container has affinity towards
+ - far from those the container has anti-affinity towards.
+
+## Affinity Annotation Syntax
+
+*Affinities* are defined as the `resource-policy.nri.io/affinity` annotation.
+*Anti-affinities* are defined as the `resource-manager.nri.io/anti-affinity`
+annotation. They are specified in the `metadata` section of the `Pod YAML`, under
+`annotations` as a dictionary, with each dictionary key being the name of the
+*container* within the Pod to which the annotation belongs.
+
+```yaml
+metadata:
+  annotations:
+ resource-manager.nri.io/affinity: |
+ container1:
+ - scope:
+ key: key-ref
+ operator: op
+ values:
+ - value1
+ ...
+ - valueN
+ match:
+ key: key-ref
+ operator: op
+ values:
+ - value1
+ ...
+ - valueN
+ weight: w
+```
+
+An anti-affinity is defined similarly but using `resource-manager.nri.io/anti-affinity`
+as the annotation key.
+
+```yaml
+metadata:
+  annotations:
+ resource-manager.nri.io/anti-affinity: |
+ container1:
+ - scope:
+ key: key-ref
+ operator: op
+ values:
+ - value1
+ ...
+ - valueN
+ match:
+ key: key-ref
+ operator: op
+ values:
+ - value1
+ ...
+ - valueN
+ weight: w
+```
+
+## Affinity Semantics
+
+An affinity consists of three parts:
+
+ - `scope expression`: defines which containers this affinity is evaluated against
+ - `match expression`: defines for which containers (within the scope) the affinity applies to
+ - `weight`: defines how *strong* a pull or a push the affinity causes
+
+*Affinities* are also sometimes referred to as *positive affinities* while
+*anti-affinities* are referred to as *negative affinities*. The reason for this is
+that the only difference between these are that affinities have a *positive weight*
+while anti-affinities have a *negative weight*.
+
+The *scope* of an affinity defines the *bounding set of containers* the affinity can
+apply to. The affinity *expression* is evaluated against the containers *in scope* and
+it *selects the containers* the affinity really has an effect on. The *weight* specifies
+whether the effect is a *pull* or a *push*. *Positive* weights cause a *pull* while
+*negative* weights cause a *push*. Additionally, the *weight* specifies *how strong* the
+push or the pull is. This is useful in situations where the policy needs to make some
+compromises because an optimal placement is not possible. The weight then also acts as
+a way to specify preferences of priorities between the various compromises: the heavier
+the weight the stronger the pull or push and the larger the probability that it will be
+honored, if this is possible at all.
+
+The scope can be omitted from an affinity in which case it implies *Pod scope*, in other
+words the scope of all containers that belong to the same Pod as the container for
+which the affinity is defined.
+
+The weight can also be omitted in which case it defaults to -1 for anti-affinities
+and +1 for affinities. Weights are currently limited to the range [-1000,1000].
+
+Both the affinity scope and the match expression select containers, so they are
+syntactically identical. Both are *expressions*. An expression consists of three parts:
+
+ - key: specifies what *metadata* to pick from a container for evaluation
+ - operation (op): specifies what *logical operation* the expression evaluates
+  - values: a set of *strings* to evaluate the value of the key against
+
+The supported keys are:
+
+ - for pods:
+ - `name`
+ - `namespace`
+ - `qosclass`
+ - `labels/`
+ - `id`
+ - `uid`
+ - for containers:
+ - `pod/`
+ - `name`
+ - `namespace`
+ - `qosclass`
+ - `labels/`
+ - `tags/`
+ - `id`
+
+Essentially an expression defines a logical operation of the form (key op values).
+Evaluating this logical expression takes the value of the key and checks it
+against the given values using the operation, yielding
+a boolean true/false result. Currently the following operations are supported:
+
+ - `Equals`: equality, true if the *value of key* equals the single item in *values*
+ - `NotEqual`: inequality, true if the *value of key* is not equal to the single item in *values*
+ - `In`: membership, true if *value of key* equals to any among *values*
+ - `NotIn`: negated membership, true if the *value of key* is not equal to any among *values*
+ - `Exists`: true if the given *key* exists with any value
+ - `NotExists`: true if the given *key* does not exist
+ - `AlwaysTrue`: always evaluates to true, can be used to denote node-global scope (all containers)
+ - `Matches`: true if the *value of key* matches the globbing pattern in values
+ - `MatchesNot`: true if the *value of key* does not match the globbing pattern in values
+ - `MatchesAny`: true if the *value of key* matches any of the globbing patterns in values
+ - `MatchesNone`: true if the *value of key* does not match any of the globbing patterns in values
+
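+As a purely illustrative example, the following annotation gives the container
+`frontend` an affinity towards containers whose name matches `redis-*` among
+the containers of pods in the `web` namespace; the names and the weight are
+made up:
+
+```yaml
+metadata:
+  annotations:
+    resource-manager.nri.io/affinity: |
+      frontend:
+        - scope:
+            key: namespace
+            operator: Equals
+            values:
+              - web
+          match:
+            key: name
+            operator: Matches
+            values:
+              - redis-*
+          weight: 10
+```
+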
+The effective affinity between containers C_1 and C_2, A(C_1, C_2) is the sum of the
+weights of all pairwise in-scope matching affinities W(C_1, C_2). To put it another way,
+evaluating an affinity for a container C_1 is done by first using the scope (expression)
+to determine which containers are in the scope of the affinity. Then, for each in-scope
+container C_2 for which the match expression evaluates to true, taking the weight of the
+affinity and adding it to the effective affinity A(C_1, C_2).
+
+Note that currently (for the topology-aware policy) this evaluation is asymmetric:
+A(C_1, C_2) and A(C_2, C_1) can and will be different unless the affinity annotations are
+crafted to prevent this (by making them fully symmetric). Moreover, A(C_1, C_2) is calculated
+and taken into consideration during resource allocation for C_1, while A(C_2, C_1)
+is calculated and taken into account during resource allocation for C_2. This might be
+changed in a future version.
+
+
+Currently affinity expressions lack support for boolean operators (and, or, not).
+Sometimes this limitation can be overcome by using joint keys, especially with
+matching operators. The joint key syntax allows joining the value of several keys
+with a separator into a single value. A joint key can be specified in a simple or
+full format:
+
+ - simple: ``, this is equivalent to `:::`
+ - full: ``
+
+A joint key evaluates to the values of all the ``-separated subkeys joined by ``.
+A non-existent subkey evaluates to the empty string. For instance the joint key
+
+ `:pod/qosclass:pod/name:name`
+
+evaluates to
+
+ `::`
+
+For existence operators, a joint key is considered to exist if any of its subkeys exists.
+
+
+## Examples
+
+Put the container `peter` close to the container `sheep` but far away from the
+container `wolf`.
+
+```yaml
+metadata:
+ annotations:
+ resource-manager.nri.io/affinity: |
+ peter:
+ - match:
+ key: name
+ operator: Equals
+ values:
+ - sheep
+ weight: 5
+ resource-manager.nri.io/anti-affinity: |
+ peter:
+ - match:
+ key: name
+ operator: Equals
+ values:
+ - wolf
+ weight: 5
+```
+
+## Shorthand Notation
+
+There is an alternative shorthand syntax for what is considered to be the most common
+case: defining affinities between containers within the same pod. With this notation
+one needs to give just the names of the containers, like in the example below.
+
+```yaml
+ annotations:
+ resource-manager.nri.io/affinity: |
+ container3: [ container1 ]
+ resource-manager.nri.io/anti-affinity: |
+ container3: [ container2 ]
+ container4: [ container2, container3 ]
+```
+
+
+This shorthand notation defines:
+ - `container3` having
+ - affinity (weight 1) to `container1`
+ - `anti-affinity` (weight -1) to `container2`
+ - `container4` having
+ - `anti-affinity` (weight -1) to `container2`, and `container3`
+
+The equivalent annotation in full syntax would be
+
+```yaml
+metadata:
+ annotations:
+ resource-manager.nri.io/affinity: |+
+ container3:
+ - match:
+ key: labels/io.kubernetes.container.name
+ operator: In
+ values:
+ - container1
+ resource-manager.nri.io/anti-affinity: |+
+ container3:
+ - match:
+ key: labels/io.kubernetes.container.name
+ operator: In
+ values:
+ - container2
+ container4:
+ - match:
+ key: labels/io.kubernetes.container.name
+ operator: In
+ values:
+ - container2
+ - container3
+```
diff --git a/docs/resource-policy/policy/cpu-allocator.md b/docs/resource-policy/policy/cpu-allocator.md
new file mode 100644
index 000000000..8d7eb0419
--- /dev/null
+++ b/docs/resource-policy/policy/cpu-allocator.md
@@ -0,0 +1,61 @@
+# CPU Allocator
+
+NRI Resource Policy has a separate CPU allocator component that helps policies
+make educated allocations of CPU cores for workloads. Currently, all policies
+utilize the built-in CPU allocator. See the policy-specific documentation for
+more details.
+
+## Topology Based Allocation
+
+The CPU allocator tries to optimize the allocation of CPUs in terms of the
+hardware topology. More specifically, it aims at packing all CPUs of one
+request "near" each other in order to minimize memory latencies between CPUs.
+
+## CPU Prioritization
+
+The CPU allocator also does automatic CPU prioritization by detecting CPU
+features and their configuration parameters. Currently, NRI Resource Policy
+supports CPU priority detection based on the `intel_pstate` scaling
+driver in the Linux CPUFreq subsystem and on Intel Speed Select Technology
+(SST).
+
+CPUs are divided into three priority classes: *high*, *normal* and *low*.
+Policies utilizing the CPU allocator may choose to prefer a certain priority
+class for certain types of workloads, for example preferring (and preserving)
+high priority CPUs for high priority workloads.
+
+### Intel Speed Select Technology (SST)
+
+NRI Resource Policy supports detection of all Intel Speed Select Technology
+(SST) features, i.e. Speed Select Technology Performance Profile (SST-PP), Base
+Frequency (SST-BF), Turbo Frequency (SST-TF) and Core Power (SST-CP).
+
+CPU prioritization is based on detection of the currently active SST features
+and their parameterization:
+
+1. If SST-TF has been enabled, all CPUs prioritized by SST-TF are flagged as
+ high priority.
+1. If SST-CP is enabled but SST-TF disabled, the CPU allocator examines the
+   active Classes of Service (CLOSes) and their parameters. CPUs associated
+   with the highest priority CLOS are flagged as high priority, CPUs in the
+   lowest priority CLOS as low priority, and CPUs in any possible "middle
+   priority" CLOS as normal priority.
+1. If SST-BF has been enabled and SST-TF and SST-CP are inactive, all BF high
+ priority cores (having higher guaranteed base frequency) will be flagged
+ as high priority.
+
+### Linux CPUFreq
+
+CPUFreq based prioritization only takes effect if Intel Speed Select Technology
+(SST) is disabled (or not supported). NRI Resource Policy divides CPU cores
+into priority classes based on two parameters:
+
+- base frequency
+- EPP (Energy-Performance Preference)
+
+CPU cores with high base frequency (relative to the other cores in the system)
+will be flagged as high priority. Low base frequency will map to low priority,
+correspondingly.
+
+CPU cores with high EPP priority (relative to the other cores in the system)
+will be marked as high priority cores.
diff --git a/docs/resource-policy/policy/index.rst b/docs/resource-policy/policy/index.rst
new file mode 100644
index 000000000..35bd500de
--- /dev/null
+++ b/docs/resource-policy/policy/index.rst
@@ -0,0 +1,10 @@
+Policies
+########
+
+.. toctree::
+ :maxdepth: 1
+
+ topology-aware.md
+ balloons.md
+ container-affinity.md
+ cpu-allocator.md
diff --git a/docs/resource-policy/policy/topology-aware.md b/docs/resource-policy/policy/topology-aware.md
new file mode 100644
index 000000000..9904e891d
--- /dev/null
+++ b/docs/resource-policy/policy/topology-aware.md
@@ -0,0 +1,432 @@
+# Topology-Aware Policy
+
+## Background
+
+On server-grade hardware the CPU cores, I/O devices and other peripherals
+form a rather complex network together with the memory controllers, the
+I/O bus hierarchy and the CPU interconnect. When a combination of these
+resources are allocated to a single workload, the performance of that
+workload can vary greatly, depending on how efficiently data is transferred
+between them or, in other words, on how well the resources are aligned.
+
+There are a number of inherent architectural hardware properties that,
+unless properly taken into account, can cause resource misalignment and
+workload performance degradation. There are a multitude of CPU cores
+available to run workloads. There are a multitude of memory controllers
+these workloads can use to store and retrieve data from main memory. There
+are a multitude of I/O devices attached to a number of I/O buses the same
+workloads can access. The CPU cores can be divided into a number of groups,
+with each group having different access latency and bandwidth to each
+memory controller and I/O device.
+
+If a workload is not assigned to run with a properly aligned set of CPU,
+memory and devices, it will not be able to achieve optimal performance.
+Given the idiosyncrasies of hardware, allocating a properly aligned set
+of resources for optimal workload performance requires identifying and
+understanding the multiple dimensions of access latency locality present
+in hardware or, in other words, hardware topology awareness.
+
+## Overview
+
+The `topology-aware` policy automatically builds a tree of pools based on the
+detected hardware topology. Each pool has a set of CPUs and memory zones
+assigned as their resources. Resource allocation for workloads happens by
+first picking the pool which is considered to best fit the resource
+requirements of the workload and then assigning CPU and memory from this pool.
+
+The pool nodes at various depths from bottom to top represent the NUMA nodes,
+dies, sockets, and finally the whole of the system at the root node. Leaf NUMA
+nodes are assigned the memory behind their controllers / zones and CPU cores
+with the smallest distance / access penalty to this memory. If the machine
+has multiple types of memory separately visible to both the kernel and user
+space, for instance both DRAM and
+[PMEM](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html),
+each zone of such special memory is assigned to the closest NUMA node pool.
+
+Each non-leaf pool node in the tree is assigned the union of the resources of
+its children. So in practice, die nodes end up containing all the CPU cores
+and the memory zones in the corresponding die, socket nodes end up containing
+the CPU cores and memory zones in the corresponding socket's dies, and the root
+ends up containing all CPU cores and memory zones in all sockets.
+
+With this setup, each pool in the tree has a topologically aligned set of CPU
+and memory resources. The amount of available resources gradually increases in
+the tree from bottom to top, while the strictness of alignment is gradually
+relaxed. In other words, as one moves from bottom to top in the tree, it is
+getting gradually easier to fit in a workload, but the price paid for this is
+a gradually increasing maximum potential cost or penalty for memory access and
+data transfer between CPU cores.
+
+Another property of this setup is that the resource sets of sibling pools at
+the same depth in the tree are disjoint while the resource sets of descendant
+pools along the same path in the tree partially overlap, with the intersection
+decreasing as the distance between pools increases. This makes it easy to
+isolate workloads from each other. As long as workloads are assigned to pools
+that have no other common ancestor than the root, the resources of these
+workloads should be as well isolated from each other as possible on the given
+hardware.
+
+With such an arrangement, this policy should handle topology-aware alignment
+of resources without any special or extra configuration. When allocating
+resources, the policy
+
+ - filters out all pools with insufficient free capacity
+ - runs a scoring algorithm for the remaining ones
+ - picks the one with the best score
+ - assigns resources to the workload from there
+
+Although the details of the scoring algorithm are subject to change as the
+implementation evolves, its basic principles are roughly
+
+ - prefer pools lower in the tree, IOW stricter alignment and lower latency
+ - prefer idle pools over busy ones, IOW more remaining free capacity and
+ fewer workloads
+ - prefer pools with better overall device alignment
+
+## Features
+
+The `topology-aware` policy has the following features:
+
+ - topologically aligned allocation of CPU and memory
+ * assign CPU and memory to workloads with tightest available alignment
+ - aligned allocation of devices
+ * pick pool for workload based on locality of devices already assigned
+ - shared allocation of CPU cores
+ * assign workload to shared subset of pool CPUs
+ - exclusive allocation of CPU cores
+ * dynamically slice off CPU cores from shared subset and assign to workload
+ - mixed allocation of CPU cores
+ * assign both exclusive and shared CPU cores to workload
+ - discovering and using kernel-isolated CPU cores (['isolcpus'](https://www.kernel.org/doc/html/latest/admin-guide/kernel-parameters.html#cpu-lists))
+ * use kernel-isolated CPU cores for exclusively assigned CPU cores
+ - exposing assigned resources to workloads
+ - notifying workloads about changes in resource assignment
+ - dynamic relaxation of memory alignment to prevent OOM
+ * dynamically widen workload memory set to avoid pool/workload OOM
+ - multi-tier memory allocation
+ * assign workloads to memory zones of their preferred type
+ * the policy knows about three kinds of memory:
+ - DRAM is regular system main memory
+ - PMEM is large-capacity memory, such as
+ [IntelĀ® Optaneā¢ memory](https://www.intel.com/content/www/us/en/products/memory-storage/optane-dc-persistent-memory.html)
+ - [HBM](https://en.wikipedia.org/wiki/High_Bandwidth_Memory) is high speed memory,
+ typically found on some special-purpose computing systems
+ - cold start
+ * pin workload exclusively to PMEM for an initial warm-up period
+ - dynamic page demotion
+ * forcibly migrate read-only and idle container memory pages to PMEM
+
+## Activating the Policy
+
+You can activate the `topology-aware` policy by using the following configuration
+fragment in the configuration for `nri-resource-policy-topology-aware`:
+
+```yaml
+policy:
+ Active: topology-aware
+ ReservedResources:
+ CPU: 750m
+```
+
+## Configuring the Policy
+
+The policy has a number of configuration options which affect its default behavior.
+These options can be supplied as part of the
+[dynamic configuration](../setup.md#using-nri-resource-policy-agent-and-a-configmap)
+received via the [`node agent`](../node-agent.md), or in a fallback or forced
+configuration file. The available configuration options are listed below; an
+example fragment combining them follows the list.
+
+ - `PinCPU`
+ * whether to pin workloads to assigned pool CPU sets
+ - `PinMemory`
+ * whether to pin workloads to assigned pool memory zones
+ - `PreferIsolatedCPUs`
+ * whether isolated CPUs are preferred by default for workloads that are
+ eligible for exclusive CPU allocation
+ - `PreferSharedCPUs`
+ * whether shared allocation is preferred by default for workloads that
+ would be otherwise eligible for exclusive CPU allocation
+ - `ReservedPoolNamespaces`
+   * list of extra namespaces (or glob patterns) that will be allocated to reserved CPUs
+ - `ColocatePods`
+   * whether to try to allocate containers within a pod to the same, or topologically close, pools
+ - `ColocateNamespaces`
+   * whether to try to allocate containers within a namespace to the same, or topologically close, pools
+
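+The following fragment sketches how these options could be combined in the policy
+configuration. The values here are illustrative only, and the `monitoring`
+namespace is a placeholder:
+
+```yaml
+policy:
+  Active: topology-aware
+  ReservedResources:
+    CPU: 750m
+  topology-aware:
+    PinCPU: true
+    PinMemory: true
+    PreferIsolatedCPUs: true
+    PreferSharedCPUs: false
+    ReservedPoolNamespaces: ["monitoring", "reserved-*"]
+```
+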
+## Policy CPU Allocation Preferences
+
+There are a number of workload properties this policy actively checks to decide
+if the workload could potentially benefit from extra resource allocation
+optimizations. Unless configured differently, containers fulfilling certain
+corresponding criteria are considered eligible for these optimizations. This
+will be reflected in the assigned resources whenever that is possible at the
+time the container's creation / resource allocation request hits the policy.
+
+The set of these extra optimizations consists of
+
+ - assignment of `kube-reserved` CPUs
+ - assignment of exclusively allocated CPU cores
+ - usage of kernel-isolated CPU cores (for exclusive allocation)
+
+The policy uses a combination of the QoS class and the resource requirements of
+the container to decide if any of these extra allocation preferences should be
+applied. Containers are divided into five groups, with each group having a
+slightly different set of criteria for eligibility.
+
+ - `kube-system` group
+ * all containers in the `kube-system` namespace
+ - `low-priority` group
+ * containers in the `BestEffort` or `Burstable` QoS class
+ - `sub-core` group
+ * Guaranteed QoS class containers with `CPU request < 1 CPU`
+ - `mixed` group
+ * Guaranteed QoS class containers with `1 <= CPU request < 2`
+ - `multi-core` group
+ * Guaranteed QoS class containers with `CPU request >= 2`
+
+The eligibility rules for extra optimization are slightly different among these
+groups, as described below; an example container specification follows the list.
+
+ - `kube-system`
+ * not eligible for extra optimizations
+ * eligible to run on `kube-reserved` CPU cores
+ * always run on shared CPU cores
+ - `low-priority`
+ * not eligible for extra optimizations
+ * always run on shared CPU cores
+ - `sub-core`
+ * not eligible for extra optimizations
+ * always run on shared CPU cores
+ - `mixed`
+ * by default eligible for exclusive and isolated allocation
+ * not eligible for either if `PreferSharedCPUs` is set to true
+ * not eligible for either if annotated to opt out from exclusive allocation
+ * not eligible for isolated allocation if annotated to opt out
+ - `multi-core`
+ * CPU request fractional (`(CPU request % 1000 milli-CPU) != 0`):
+ - by default not eligible for extra optimizations
+ - eligible for exclusive and isolated allocation if annotated to opt in
+ * CPU request not fractional:
+ - by default eligible for exclusive allocation
+ - by default not eligible for isolated allocation
+ - not eligible for exclusive allocation if annotated to opt out
+ - eligible for isolated allocation if annotated to opt in
+
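+For illustration, a container with the following resource requirements falls into
+the `multi-core` group with a non-fractional CPU request, so by default it is
+eligible for exclusive but not isolated CPU allocation. The pod name, container
+name, and image are placeholders:
+
+```yaml
+apiVersion: v1
+kind: Pod
+metadata:
+  name: multi-core-example
+spec:
+  containers:
+    - name: compute
+      image: example.com/compute:latest
+      resources:
+        requests:
+          cpu: 2
+          memory: 2Gi
+        limits:
+          cpu: 2         # requests == limits => Guaranteed QoS class
+          memory: 2Gi
+```
+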
+Eligibility for kube-reserved CPU core allocation should always be possible to
+honor. If this is not the case, it is probably due to an incorrect configuration
+which underdeclares `ReservedResources`. In that case, ordinary shared CPU cores
+will be used instead of kube-reserved ones.
+
+Eligibility for exclusive CPU allocation should always be possible to honor.
+Eligibility for isolated core allocation is only honored if there are enough
+isolated cores available to fulfill the exclusive part of the container's CPU
+request with isolated cores alone. Otherwise ordinary CPUs will be allocated,
+by slicing them off for exclusive usage from the shared subset of CPU cores in
+the container's assigned pool.
+
+Containers in the kube-system group are pinned to share all kube-reserved CPU
+cores. Containers in the low-priority or sub-core groups, and containers which
+are only eligible for shared CPU core allocation in the mixed and multi-core
+groups, are all pinned to run on the shared subset of CPU cores in the
+container's assigned pool. This shared subset can and usually does change
+dynamically as exclusive CPU cores are allocated and released in the pool.
+
+## Container CPU Allocation Preference Annotations
+
+Containers can be annotated to diverge from the default CPU allocation
+preferences the policy would otherwise apply to them. These Pod annotations
+can be given with both per-pod and per-container resolution. If both exist for
+a container, the container-specific one takes precedence.
+
+### Shared, Exclusive, and Isolated CPU Preference
+
+A container can opt in to or opt out from shared CPU allocation using the
+following Pod annotation.
+
+```yaml
+metadata:
+ annotations:
+ # opt in container C1 to shared CPU core allocation
+ prefer-shared-cpus.resource-policy.nri.io/container.C1: "true"
+ # opt in the whole pod to shared CPU core allocation
+ prefer-shared-cpus.resource-policy.nri.io/pod: "true"
+ # selectively opt out container C2 from shared CPU core allocation
+ prefer-shared-cpus.resource-policy.nri.io/container.C2: "false"
+```
+
+Opting in to exclusive allocation happens by opting out from shared allocation,
+and opting out from exclusive allocation happens by opting in to shared
+allocation.
+
+A container can opt in to or opt out from isolated exclusive CPU core
+allocation using the following Pod annotation.
+
+```yaml
+metadata:
+ annotations:
+ # opt in container C1 to isolated exclusive CPU core allocation
+ prefer-isolated-cpus.resource-policy.nri.io/container.C1: "true"
+ # opt in the whole pod to isolated exclusive CPU core allocation
+ prefer-isolated-cpus.resource-policy.nri.io/pod: "true"
+ # selectively opt out container C2 from isolated exclusive CPU core allocation
+ prefer-isolated-cpus.resource-policy.nri.io/container.C2: "false"
+```
+
+These Pod annotations have no effect on containers which are not eligible for
+exclusive allocation.
+
+### Implicit Hardware Topology Hints
+
+`NRI Resource Policy` automatically generates HW `Topology Hints` for devices
+assigned to a container, prior to handing the container off to the active policy
+for resource allocation. The `topology-aware` policy is hint-aware and normally
+takes topology hints into account when picking the best pool to allocate resources
+from. Hints indicate optimal `HW locality` for device access, and they can
+significantly alter which pool gets picked for a container.
+
+Since device topology hints are implicitly generated, there are cases where one
+would like the policy to disregard them altogether, for instance when a local
+volume is used by a container but not in any performance-critical manner.
+
+Containers can be annotated to opt out from and selectively opt in to hint-aware
+pool selection using the following Pod annotations.
+
+```yaml
+metadata:
+ annotations:
+ # only disregard hints for container C1
+ topologyhints.resource-policy.nri.io/container.C1: "false"
+ # disregard hints for all containers by default
+ topologyhints.resource-policy.nri.io/pod: "false"
+ # but take hints into account for container C2
+ topologyhints.resource-policy.nri.io/container.C2: "true"
+```
+
+Topology hint generation is globally enabled by default. Therefore, using the
+Pod annotation as an opt-in only has an effect when the whole pod is annotated
+to opt out from hint-aware pool selection.
+
+### Implicit Topological Co-location for Pods and Namespaces
+
+The `ColocatePods` or `ColocateNamespaces` configuration options control whether
+the policy will try to co-locate, that is allocate topologically close, containers
+within the same Pod or K8s namespace.
+
+Both of these options are false by default. Setting them to true is a shorthand
+for adding to each container an affinity of weight 10 for all other containers
+in the same pod or namespace.
+
+Containers with user-defined affinities are never extended with either of these
+co-location affinities. However, such containers can still have affinity effects
+on other containers that do get extended with co-location. Therefore mixing user-
+defined affinities with implicit co-location requires both careful consideration
+and a thorough understanding of affinity evaluation, or it should be avoided
+altogether.
+
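+As a sketch, enabling pod-level co-location in the configuration could look like
+this (using the same configuration layout as in the earlier fragments):
+
+```yaml
+policy:
+  Active: topology-aware
+  topology-aware:
+    ColocatePods: true
+    ColocateNamespaces: false
+```
+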
+## Cold Start
+
+The `topology-aware` policy supports "cold start" functionality. When cold start
+is enabled and the workload is allocated to a topology node with both DRAM and
+PMEM memory, the workload is initially granted only the PMEM memory controller.
+The DRAM controller is added to the workload only after the cold start timeout
+has expired. The effect of this is that large, initially unused memory areas do
+not need to be migrated to PMEM later, because they were allocated there to
+begin with. Cold start is configured like this in the pod metadata:
+
+```yaml
+metadata:
+ annotations:
+ memory-type.resource-policy.nri.io/container.container1: dram,pmem
+ cold-start.resource-policy.nri.io/container.container1: |
+ duration: 60s
+```
+
+Alternatively, you can use the following deprecated Pod annotation syntax to
+achieve the same effect, but support for this syntax is subject to removal in a
+future release:
+
+```yaml
+metadata:
+ annotations:
+ resource-policy.nri.io/memory-type: |
+ container1: dram,pmem
+ resource-policy.nri.io/cold-start: |
+ container1:
+ duration: 60s
+```
+
+In the above example, `container1` would be initially granted only PMEM
+memory controller, but after 60 seconds the DRAM controller would be
+added to the container memset.
+
+## Dynamic Page Demotion
+
+The `topology-aware` policy also supports dynamic page demotion. With dynamic
+demotion enabled, rarely-used pages are periodically moved from DRAM to PMEM
+for those workloads which are assigned to use both DRAM and PMEM memory types.
+The configuration for this feature is done using three configuration keys:
+`DirtyBitScanPeriod`, `PageMovePeriod`, and `PageMoveCount`. All of these
+parameters need to be set to non-zero values in order for dynamic page demotion
+to get enabled. See this configuration file fragment as an example:
+
+```yaml
+policy:
+ Active: topology-aware
+ topology-aware:
+ DirtyBitScanPeriod: 10s
+ PageMovePeriod: 2s
+ PageMoveCount: 1000
+```
+
+In this setup, every pid in every container in every non-system pod assigned
+to use both DRAM and PMEM memory would have its page ranges scanned for
+non-accessed pages every ten seconds. The result of the scan would be fed to
+a page-moving loop, which would attempt to move 1000 pages every two seconds
+from DRAM to PMEM.
+
+## Container memory requests and limits
+
+Due to inaccuracies in how `nri-resource-policy` calculates memory requests for
+pods in QoS class `Burstable`, you should either use `Limit` to set the amount
+of memory for containers in `Burstable` pods, or provide `nri-resource-policy`
+with an exact copy of the resource requirements from the Pod Spec as an extra
+Pod annotation.
+
+## Reserved pool namespaces
+
+Users can mark certain namespaces to have a reserved CPU allocation.
+Containers belonging to such namespaces will only run on CPUs set aside
+according to the global CPU reservation, as configured by the `ReservedResources`
+configuration option in the policy section.
+The `ReservedPoolNamespaces` option is a list of namespace globs that will be
+allocated to the reserved CPU class.
+
+For example:
+
+```yaml
+policy:
+ Active: topology-aware
+ topology-aware:
+ ReservedPoolNamespaces: ["my-pool","reserved-*"]
+```
+
+In this setup, all the workloads in the `my-pool` namespace, and in any namespace
+whose name starts with `reserved-`, are allocated to the reserved CPU class.
+Workloads in `kube-system` are automatically assigned to the reserved CPU
+class, so there is no need to list `kube-system` here.
+
+## Reserved CPU annotations
+
+Users can mark certain pods and containers to have a reserved CPU allocation
+by using annotations. Containers having such an annotation will only run on CPUs set
+aside according to the global CPU reservation, as configured by the `ReservedResources`
+configuration option in the policy section.
+
+For example:
+
+```yaml
+metadata:
+ annotations:
+ prefer-reserved-cpus.resource-policy.nri.io/pod: "true"
+ prefer-reserved-cpus.resource-policy.nri.io/container.special: "false"
+```
diff --git a/docs/resource-policy/setup.md b/docs/resource-policy/setup.md
new file mode 100644
index 000000000..6eee35be9
--- /dev/null
+++ b/docs/resource-policy/setup.md
@@ -0,0 +1,61 @@
+# Setup and Usage
+
+When you want to try NRI Resource Policy, here is the list of things
+you need to do, assuming you already have a Kubernetes\* cluster up and
+running, using either `containerd` or `cri-o` as the runtime.
+
+ * [Install](installation.md) the NRI Resource Policy DaemonSet deployment file.
+ * Configure the runtime (`containerd` / `cri-o`).
+
+For NRI Resource Policy, you need to provide a configuration file. The default
+configuration ConfigMap file can be found in the DaemonSet deployment yaml file.
+You can edit it as needed.
+
+**NOTE**: Currently, the available policies are a work in progress.
+
+## Setting up NRI Resource Policy
+
+### Using NRI Resource Policy Agent and a ConfigMap
+
+The [NRI Resource Policy Node Agent][agent] can monitor and fetch configuration
+from the ConfigMap and pass it on to the NRI Resource Policy plugin.
+By default, the plugin automatically tries to use the agent to acquire its
+configuration, unless you override this by forcing a static local configuration
+using the `--force-config <config-file>` option.
+When using the agent, it is also possible to provide an initial fallback
+configuration using the `--fallback-config <config-file>` option. This file is
+used before the very first configuration is successfully acquired from the
+agent.
+
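+For instance, a minimal fallback configuration file could look like the
+`topology-aware-policy.cfg` sample shipped under `sample-configs/`:
+
+```yaml
+policy:
+  Active: topology-aware
+  ReservedResources:
+    CPU: 750m
+logger:
+  Debug: nri-resmgr,resource-manager,cache
+```
+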
+See the [Node Agent][agent] documentation for how to set up and configure the agent.
+
+
+## Logging and debugging
+
+You can control logging with the klog command line options or by setting the
+corresponding environment variables. You can get the name of the environment
+variable for a command line option by prepending the `LOGGER_` prefix to the
+capitalized option name without any leading dashes. For instance, setting the
+environment variable `LOGGER_SKIP_HEADERS=true` has the same effect as using
+the `-skip_headers` command line option.
+
+Additionally, the `LOGGER_DEBUG` environment variable controls debug logs.
+These are globally disabled by default. You can turn on full debugging by
+setting `LOGGER_DEBUG='*'`.
+
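+As a sketch, these environment variables could also be set in the plugin's
+DaemonSet spec; the container name below is an assumption and the debug sources
+are just examples:
+
+```yaml
+spec:
+  containers:
+    - name: nri-resource-policy-topology-aware
+      env:
+        - name: LOGGER_DEBUG
+          value: "policy,resource-manager"
+        - name: LOGGER_SKIP_HEADERS
+          value: "true"
+```
+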
+When using environment variables, be careful which configuration you pass to
+NRI Resource Policy using a file or ConfigMap. The environment is treated
+as default configuration but a file or a ConfigMap has higher precedence.
+If something is configured in both, the environment will only be in effect
+until the configuration is applied. However, in such a case if you later
+push an updated configuration to NRI Resource Policy with the overlapping
+settings removed, the original ones from the environment will be in effect
+again.
+
+For debug logs, the settings from the configuration are applied in addition
+to any settings in the environment. That said, if you turn something on in
+the environment but off in the configuration, it will be turned off
+eventually.
+
+
+[agent]: node-agent.md
diff --git a/sample-configs/balloons-policy.cfg b/sample-configs/balloons-policy.cfg
new file mode 100644
index 000000000..8ad17682b
--- /dev/null
+++ b/sample-configs/balloons-policy.cfg
@@ -0,0 +1,94 @@
+policy:
+ Active: balloons
+ # Use only 15 CPUs in total, leave cpu0 for other than Kubernetes
+ # processes.
+ AvailableResources:
+ CPU: cpuset:1-15
+ # Reserve one of our CPUs (cpu15) for kube-system tasks.
+ ReservedResources:
+ CPU: cpuset:15
+ balloons:
+ # PinCPU: allow containers to use only the CPUs in their balloons.
+ PinCPU: true
+ # PinMemory: allow containers to use only the closest memory to
+ # the CPUs in their balloons.
+ PinMemory: true
+ # IdleCPUClass: how to configure CPUs that are not included in any
+ # of the balloons.
+ IdleCPUClass: idle
+ BalloonTypes:
+ - Name: "full-core-turbo"
+ # MinCPUs: minimum number of logical cores in every balloon
+ # instance of this type.
+ # The default is 0.
+ MinCPUs: 2
+ # MaxCPUs: maximum number of logical cores in every balloon
+ # instance of this type.
+ # The default is 0 (unlimited).
+ MaxCPUs: 2
+ # CPUClass: how to configure CPUs of these balloons.
+ # The default is "".
+ CPUClass: "turbo"
+ # Namespaces: assign pods in listed namespaces to these
+ # balloons, even if there is no explicit annotation:
+ # balloon.balloons.nri-resmgr.intel.com: full-core-turbo
+ # The default is to assign only annotated pods.
+ Namespaces:
+ - "highperf"
+      # AllocatorPriority: CPU allocator priority (0: High, 1:
+ # Normal, 2: Low, 3: None). Affects the performance/type of
+ # CPUs that are selected into the balloon. CPUs for static
+ # balloon instances (MinBalloons > 0) with highest
+ # AllocatorPriority are reserved first.
+ # The default is 0.
+ AllocatorPriority: 2
+ # MinBalloons: how many balloon instances of this type are always
+      # kept in the system, even if there are no workloads assigned to them.
+ # The default is 0.
+ MinBalloons: 2
+ # PreferNewBalloons: prefer creating a new balloon for
+ # separate pods, even if their CPU requirements would allow
+ # putting them in the same balloon.
+ # The default is: false.
+ PreferNewBalloons: true
+ # PreferPerNamespaceBalloon: if true, containers in the same
+      # namespace are preferably placed in the same balloon, and
+ # containers in different namespaces to different
+ # balloons. The default is false: namespaces have no effect on
+ # placement.
+ PreferPerNamespaceBalloon: false
+      # PreferSpreadingPods: if true, containers of a single pod can
+      # be assigned to different balloons, based on which balloons
+      # have the most free CPU resources.
+      # The default is false: prefer running containers of the same
+      # pod in the same balloon(s).
+ PreferSpreadingPods: false
+
+ - Name: "socket-size"
+ MaxCPUs: 8
+ AllocatorPriority: 2
+ Namespaces:
+ - "default"
+ CPUClass: "normal"
+# CPU controller configuration specifies CPU class properties. CPUs of
+# each balloon are configured based on its CPUClass. If a balloon has
+# no CPUClass, the properties of the default class are applied.
+cpu:
+ classes:
+ default:
+ minFreq: 800
+ maxFreq: 1600
+ turbo:
+ minFreq: 3300
+ maxFreq: 3600
+ normal:
+ minFreq: 800
+ maxFreq: 2400
+instrumentation:
+ # The balloons policy exports containers running in each balloon,
+ # and cpusets of balloons. Accessible in command line:
+ # curl --silent http://localhost:8891/metrics
+ HTTPEndpoint: :8891
+ PrometheusExport: true
+logger:
+ Debug: policy
diff --git a/sample-configs/blockio.cfg b/sample-configs/blockio.cfg
new file mode 100644
index 000000000..9236e01a0
--- /dev/null
+++ b/sample-configs/blockio.cfg
@@ -0,0 +1,64 @@
+# This configuration demonstrates how to configure cgroups block io
+# controller for pods.
+#
+# The configuration defines block device parameters for three blockio
+# classes (LowPrioThrottled, HighPrioFullSpeed and Default, feel free
+# to choose any names here). Finally resource-manager.blockio maps QOS
+# classes BestEffort, Burstable (via wildcard), and Guaranteed to
+# these classes.
+#
+# Try with: nri-resource-policy-topology-aware -force-config blockio.cfg
+
+logger:
+ Debug: blockio,cgroupblkio
+
+blockio:
+ Classes:
+ # LowPrioThrottled and HighPrioFullSpeed are user-defined blockio classes
+ # in this example. Pods and containers can be assigned to these classes using Pod
+ # metadata annotations. For example in Pod yaml:
+ # ...
+ # metadata:
+ # annotations:
+ # # Default blockio class for containers in the pod:
+ # blockioclass.cri-resource-manager.intel.com/pod: LowPrioThrottled
+ # # Special blockio class for a container in the pod:
+ # blockioclass.cri-resource-manager.intel.com/container.mycontainer: HighPrioFullSpeed
+ LowPrioThrottled:
+ # Default io-scheduler weight for all devices that are not
+ # explicitly mentioned in following items.
+ - Weight: 80 # will be written to cgroups(.bfq).weight
+
+ # Configuration for all virtio and scsi block devices.
+ - Devices:
+ - /dev/vd*
+ - /dev/sd*
+ ThrottleReadBps: 50M # max read bytes per second
+ ThrottleWriteBps: 10M # max write bytes per second
+ ThrottleReadIOPS: 10k # max read io operations per second
+ ThrottleWriteIOPS: 5k # max write io operations per second
+ Weight: 50 # io-scheduler (cfq/bfq) weight for these devices,
+ # will be written to cgroups(.bfq).weight_device
+
+ # Configuration for SSD devices.
+ # This overrides above configuration for those /dev/sd* devices
+ # whose disk id contains "SSD"
+ - Devices:
+ - /dev/disk/by-id/*SSD*
+ ThrottleReadBps: 100M
+ ThrottleWriteBps: 40M
+ # Not mentioning Throttle*IOPS means no io operations throttling for matching devices.
+ Weight: 50
+
+ HighPrioFullSpeed:
+ - Weight: 400
+
+ # When Pod annotations do not define blockio class, QoS class
+ # names (BestEffort, Burstable, Guaranteed) are used as blockio
+ # class names for the pod. By default no blockio configuration
+ # takes place for them, but here we define I/O scheduler weight
+ # difference:
+ BestEffort:
+ - Weight: 90
+ Guaranteed:
+ - Weight: 200
diff --git a/sample-configs/nri-resource-policy-configmap.example.yaml b/sample-configs/nri-resource-policy-configmap.example.yaml
new file mode 100644
index 000000000..c40f32733
--- /dev/null
+++ b/sample-configs/nri-resource-policy-configmap.example.yaml
@@ -0,0 +1,304 @@
+#
+# This example creates 3 ConfigMaps:
+# - nri-resmgr-config.default: the default configuration
+# - nri-resmgr-config.group.foo: the configuration for nodes in group foo
+# - nri-resmgr-config.node.cl0-slave1: the configuration for node cl0-slave1
+#
+# You can assign nodes to group foo using the command
+# kubectl label --overwrite node $NODE_NAME nri-resmgr.intel.com/group=foo
+#
+# You can remove nodes from group foo using the command
+# kubectl label node $NODE_NAME nri-resmgr.intel.com/group-
+#
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: nri-resmgr-config.default
+ namespace: kube-system
+data:
+ policy: |+
+ Active: topology-aware
+ AvailableResources:
+ cpu: cpuset:0-63
+ ReservedResources:
+ cpu: cpuset:0-1
+ topology-aware:
+ PinCPU: true
+ PinMemory: true
+ PreferIsolatedCPUs: true
+ PreferSharedCPUs: false
+ static:
+ RelaxedIsolation: true
+ static-pools:
+ # Filesystem path to legacy configuration directory structure
+ ConfDirPath: "/etc/cmk"
+ # Filesystem path to legacy configuration file
+ ConfFilePath: ""
+ # Whether to create CMK node label
+ LabelNode: false
+ # Whether to create CMK node taint
+ TaintNode: false
+ # Pool configuration.
+ # The imaginary example system below consists of 4 sockets, 4 cores, 2
+ # threads each.
+ pools:
+ exclusive:
+          # 6 exclusive cores, 2 on each of sockets 1, 2 and 3
+ cpuLists:
+ - Cpuset: 8,9
+ Socket: 1
+ - Cpuset: 10,11
+ Socket: 1
+ - Cpuset: 16,17
+ Socket: 2
+ - Cpuset: 18,19
+ Socket: 2
+ - Cpuset: 24,25
+ Socket: 3
+ - Cpuset: 26,27
+ Socket: 3
+ exclusive: true
+ shared:
+ # 2 cores in shared pool, all on socket 1
+ cpuLists:
+ - Cpuset: 12,13,14,15
+ Socket: 1
+ exclusive: false
+ infra:
+ # Rest of cores designated to infra pool
+ cpuLists:
+ - Cpuset: 0,1,2,3,4,5,6,7
+ Socket: 0
+ - Cpuset: 20,21,22,23
+ Socket: 2
+ - Cpuset: 28,29,30,31
+ Socket: 3
+ exclusive: false
+ rdt: |+
+ # Common options
+ options:
+ # One of Full, Discovery or Disabled
+ mode: Full
+ # Set to true to disable creation of monitoring groups
+ monitoringDisabled: false
+ l3:
+ # Make this false if L3 CAT must be available
+ optional: true
+ mb:
+ # Make this false if MBA must be available
+ optional: true
+
+ # Configuration of classes
+ partitions:
+ exclusive:
+ # Allocate 60% of all L3 cache to the "exclusive" partition
+ l3Allocation: "60%"
+ mbAllocation: ["100%"]
+ classes:
+ Guaranteed:
+ # Allocate all of the partitions cache lines to "Guaranteed"
+ l3Allocation: "100%"
+ shared:
+ # Allocate 40% L3 cache IDs to the "shared" partition
+ # These will NOT overlap with the cache lines allocated for "exclusive" partition
+ l3Allocation: "40%"
+ mbAllocation: ["50%"]
+ classes:
+ Burstable:
+ # Allow "Burstable" to use all cache lines of the "shared" partition
+ l3Allocation: "100%"
+ BestEffort:
+            # Allow "BestEffort" to use only half of the L3 cache lines of the "shared" partition.
+ # These will overlap with those used by "Burstable"
+ l3Allocation: "50%"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ ## Configuration specific to a group of nodes can be specified with
+  ## name: nri-resmgr-config.group.<group_name>
+  name: nri-resmgr-config.group.foo
+ namespace: kube-system
+data:
+ policy: |+
+ Active: topology-aware
+ AvailableResources:
+ cpu: cpuset:0-63
+ ReservedResources:
+ cpu: cpuset:0-1
+ topology-aware:
+ PinCPU: true
+ PinMemory: false
+ PreferIsolatedCPUs: false
+ PreferSharedCPUs: false
+ static:
+ RelaxedIsolation: true
+ static-pools:
+ # This is an example configuration for static-pools policy.
+ # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each.
+ pools:
+ exclusive:
+            # 6 exclusive cores, 2 on each of sockets 1, 2 and 3
+ cpuLists:
+ - Cpuset: 8,9
+ Socket: 1
+ - Cpuset: 10,11
+ Socket: 1
+ - Cpuset: 16,17
+ Socket: 2
+ - Cpuset: 18,19
+ Socket: 2
+ - Cpuset: 24,25
+ Socket: 3
+ - Cpuset: 26,27
+ Socket: 3
+ exclusive: true
+ shared:
+ # 2 cores in shared pool, all on socket 1
+ cpuLists:
+ - Cpuset: 12,13,14,15
+ Socket: 1
+ exclusive: false
+ infra:
+ # Rest of cores designated to infra pool
+ cpuLists:
+ - Cpuset: 0,1,2,3,4,5,6,7
+ Socket: 0
+ - Cpuset: 20,21,22,23
+ Socket: 2
+ - Cpuset: 28,29,30,31
+ Socket: 3
+ exclusive: false
+ rdt: |+
+ # Common options
+ options:
+ # One of Full, Discovery or Disabled
+ mode: Full
+ # Set to true to disable creation of monitoring groups
+ monitoringDisabled: false
+ l3:
+ # Make this false if L3 CAT must be available
+ optional: true
+ mb:
+ # Make this false if MBA must be available
+ optional: true
+
+ # Configuration of classes
+ partitions:
+ exclusive:
+ # Allocate 60% of all L3 cache to the "exclusive" partition
+ l3Allocation: "60%"
+ mbAllocation: ["100%"]
+ classes:
+ Guaranteed:
+ # Allocate all of the partitions cache lines to "Guaranteed"
+ l3Allocation: "100%"
+ shared:
+ # Allocate 40% L3 cache IDs to the "shared" partition
+ # These will NOT overlap with the cache lines allocated for "exclusive" partition
+ l3Allocation: "40%"
+ mbAllocation: ["50%"]
+ classes:
+ Burstable:
+ # Allow "Burstable" to use all cache lines of the "shared" partition
+ l3Allocation: "100%"
+ BestEffort:
+            # Allow "BestEffort" to use only half of the L3 cache lines of the "shared" partition.
+ # These will overlap with those used by "Burstable"
+ l3Allocation: "50%"
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ ## Node-specific configuration can be specified with
+  ## name: nri-resmgr-config.node.<node_name>
+  name: nri-resmgr-config.node.cl0-slave1
+ namespace: kube-system
+data:
+ policy: |+
+ Active: topology-aware
+ AvailableResources:
+ cpu: cpuset:0-63
+ ReservedResources:
+ cpu: cpuset:0-1
+ topology-aware:
+ PinCPU: false
+ PinMemory: true
+ PreferIsolatedCPUs: false
+ PreferSharedCPUs: false
+ static:
+ RelaxedIsolation: true
+ static-pools:
+ # This is an example configuration for static-pools policy.
+ # The imaginary example system here consists of 4 sockets, 4 cores, 2 threads each.
+ pools:
+ exclusive:
+            # 6 exclusive cores, 2 on each of sockets 1, 2 and 3
+ cpuLists:
+ - Cpuset: 8,9
+ Socket: 1
+ - Cpuset: 10,11
+ Socket: 1
+ - Cpuset: 16,17
+ Socket: 2
+ - Cpuset: 18,19
+ Socket: 2
+ - Cpuset: 24,25
+ Socket: 3
+ - Cpuset: 26,27
+ Socket: 3
+ exclusive: true
+ shared:
+ # 2 cores in shared pool, all on socket 1
+ cpuLists:
+ - Cpuset: 12,13,14,15
+ Socket: 1
+ exclusive: false
+ infra:
+ # Rest of cores designated to infra pool
+ cpuLists:
+ - Cpuset: 0,1,2,3,4,5,6,7
+ Socket: 0
+ - Cpuset: 20,21,22,23
+ Socket: 2
+ - Cpuset: 28,29,30,31
+ Socket: 3
+ exclusive: false
+ rdt: |+
+ # Common options
+ options:
+ # One of Full, Discovery or Disabled
+ mode: Full
+ # Set to true to disable creation of monitoring groups
+ monitoringDisabled: false
+ l3:
+ # Make this false if L3 CAT must be available
+ optional: true
+ mb:
+ # Make this false if MBA must be available
+ optional: true
+
+ # Configuration of classes
+ partitions:
+ exclusive:
+ # Allocate 60% of all L3 cache to the "exclusive" partition
+ l3Allocation: "60%"
+ mbAllocation: ["100%"]
+ classes:
+ Guaranteed:
+ # Allocate all of the partitions cache lines to "Guaranteed"
+ l3Allocation: "100%"
+ shared:
+ # Allocate 40% L3 cache IDs to the "shared" partition
+ # These will NOT overlap with the cache lines allocated for "exclusive" partition
+ l3Allocation: "40%"
+ mbAllocation: ["50%"]
+ classes:
+ Burstable:
+ # Allow "Burstable" to use all cache lines of the "shared" partition
+ l3Allocation: "100%"
+ BestEffort:
+            # Allow "BestEffort" to use only half of the L3 cache lines of the "shared" partition.
+ # These will overlap with those used by "Burstable"
+ l3Allocation: "50%"
+ logger: |+
+ Debug: resource-manager,cache
diff --git a/sample-configs/topology-aware-policy.cfg b/sample-configs/topology-aware-policy.cfg
new file mode 100644
index 000000000..65ae79427
--- /dev/null
+++ b/sample-configs/topology-aware-policy.cfg
@@ -0,0 +1,6 @@
+policy:
+ Active: topology-aware
+ ReservedResources:
+ CPU: 750m
+logger:
+ Debug: nri-resmgr,resource-manager,cache
diff --git a/test/e2e/run.sh b/test/e2e/run.sh
index afab05085..159b7b0b5 100755
--- a/test/e2e/run.sh
+++ b/test/e2e/run.sh
@@ -77,6 +77,23 @@ if [ "$1" == "runtime-logs" ]; then
exit
fi
+script_source="$(< "$0") $(< "$LIB_DIR/vm.bash")"
+
+help() { # script API
+ # Usage: help [FUNCTION|all]
+ #
+ # Print help on all functions or on the FUNCTION available in script.
+ awk -v f="$1" \
+ '/^[a-z].*script API/{split($1,a,"(");if(f==""||f==a[1]||f=="all"){print "";print a[1]":";l=2}}
+ !/^ #/{l=l-1}
+ /^ #/{if(l>=1){split($0,a,"#"); print " "a[2]; if (f=="") l=0}}' <<<"$script_source"
+}
+
+if [ "$1" == "help" ]; then
+ help
+ exit 0
+fi
+
echo
echo " VM = $vm_name"
echo " Distro = $distro"