diff --git a/.gitignore b/.gitignore index 27410f884b5..93fcc668379 100644 --- a/.gitignore +++ b/.gitignore @@ -223,7 +223,6 @@ TAGS /doc/sphinx/*/conf.py /doc/sphinx/*/generated /doc/sphinx/build-[0-9]*.txt -/doc/sphinx/shared/images/*.png # Test artifacts (from unit tests, regression tests, static analysis, etc.) *.coverity diff --git a/INSTALL.md b/INSTALL.md index 26524d1aee3..a536eb7ff81 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -49,7 +49,6 @@ Also: | Linux-HA style fencing agents | | cluster-glue-libs-devel | libglue-devel | cluster-glue-dev | | documentation | | asciidoc or asciidoctor | asciidoc or asciidoctor | asciidoc or asciidoctor | | documentation | | help2man | help2man | help2man | -| documentation | | inkscape | inkscape | inkscape | | documentation | | docbook-style-xsl | docbook-xsl-stylesheets | docbook-xsl | | documentation | | python3-sphinx | python3-sphinx | python3-sphinx | | documentation (PDF) | | latexmk texlive texlive-capt-of texlive-collection-xetex texlive-fncychap texlive-framed texlive-multirow texlive-needspace texlive-tabulary texlive-titlesec texlive-threeparttable texlive-upquote texlive-wrapfig texlive-xetex | texlive texlive-latex | texlive texlive-latex-extra | diff --git a/agents/ocf/ping.in b/agents/ocf/ping.in index 73a3677c0a2..5a0642e1e82 100755 --- a/agents/ocf/ping.in +++ b/agents/ocf/ping.in @@ -368,7 +368,7 @@ hosts_family() { return $family } -integer=$(echo ${OCF_RESKEY_timeout} | egrep -o '[0-9]*') +integer=$(echo ${OCF_RESKEY_timeout} | $EGREP -o '[0-9]*') case "${OCF_RESKEY_timeout}" in *[0-9]ms|*[0-9]msec) OCF_RESKEY_timeout=$(expr $integer / 1000);; *[0-9]m|*[0-9]min) OCF_RESKEY_timeout=$(expr $integer \* 60);; diff --git a/configure.ac b/configure.ac index 477013cfa11..39f0b4baaf6 100644 --- a/configure.ac +++ b/configure.ac @@ -931,7 +931,6 @@ dnl ============================================== AC_PATH_PROGS([ASCIIDOC_CONV], [asciidoc asciidoctor]) AC_PATH_PROG([HELP2MAN], [help2man]) 
AC_PATH_PROG([SPHINX], [sphinx-build]) -AC_PATH_PROG([INKSCAPE], [inkscape]) AC_PATH_PROG([XSLTPROC], [xsltproc]) AC_PATH_PROG([XMLCATALOG], [xmlcatalog]) @@ -976,8 +975,7 @@ AM_CONDITIONAL([BUILD_ASCIIDOC], [test "x${ASCIIDOC_CONV}" != x]) AS_IF([test x"${ASCIIDOC_CONV}" != x""], [PCMK_FEATURES="$PCMK_FEATURES ascii-docs"]) -AM_CONDITIONAL([BUILD_SPHINX_DOCS], - [test x"${SPHINX}" != x"" && test x"${INKSCAPE}" != x""]) +AM_CONDITIONAL([BUILD_SPHINX_DOCS], [test x"${SPHINX}" != x""]) AM_COND_IF([BUILD_SPHINX_DOCS], [PCMK_FEATURES="$PCMK_FEATURES books"]) dnl Pacemaker's shell scripts (and thus man page builders) rely on GNU getopt diff --git a/cts/README.md b/cts/README.md index 595268d8b64..943ef1bf173 100644 --- a/cts/README.md +++ b/cts/README.md @@ -309,22 +309,3 @@ without requiring a password to be entered each time: If this works without prompting for a password, you're in business. If not, look at the documentation for your version of ssh. - - -## Upgrading scheduler test inputs for new XSLTs - -The scheduler/xml inputs should be kept in sync with the latest major schema -version, since these tests are not meant to test schema upgrades (unless -expressly designated as such). - -To upgrade the inputs to a new major schema version: - - cd "$(git rev-parse --show-toplevel)/xml" - ./regression.sh cts_scheduler -G - cd "$(git rev-parse --show-toplevel)/cts" - git add --interactive . - git commit -m 'Test: scheduler: upgrade test inputs to schema $X.$Y' - ./cts-scheduler || echo 'Investigate what went wrong' - -The first two commands can be run anytime to verify no further upgrades are -needed. 
diff --git a/cts/cts-exec.in b/cts/cts-exec.in index 6f00d68f01e..dceab9ee36c 100644 --- a/cts/cts-exec.in +++ b/cts/cts-exec.in @@ -308,11 +308,13 @@ class ExecTests(Tests): os.system("service pacemaker_remote stop") self.cleanup_environment() - if self.tls and not os.path.isfile("/etc/pacemaker/authkey"): - print("Installing /etc/pacemaker/authkey ...") - os.system("mkdir -p /etc/pacemaker") - os.system("dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1") - self._installed_files.append("/etc/pacemaker/authkey") + # @TODO Support the option of using specified existing certificates + authkey = "%s/authkey" % BuildOptions.PACEMAKER_CONFIG_DIR + if self.tls and not os.path.isfile(authkey): + print("Installing %s ..." % authkey) + os.system("mkdir -p %s" % BuildOptions.PACEMAKER_CONFIG_DIR) + os.system("dd if=/dev/urandom of=%s bs=4096 count=1" % authkey) + self._installed_files.append(authkey) # If we're in build directory, install agents if not already installed # pylint: disable=protected-access diff --git a/doc/shared/en-US/pacemaker-intro.txt b/doc/shared/en-US/pacemaker-intro.txt deleted file mode 100644 index b2a81cb757c..00000000000 --- a/doc/shared/en-US/pacemaker-intro.txt +++ /dev/null @@ -1,186 +0,0 @@ -:compat-mode: legacy -== What Is 'Pacemaker'? == - -*Pacemaker* is a high-availability 'cluster resource manager' -- software that -runs on a set of hosts (a 'cluster' of 'nodes') in order to preserve integrity -and minimize downtime of desired services ('resources'). -footnote:[ -'Cluster' is sometimes used in other contexts to refer to hosts grouped -together for other purposes, such as high-performance computing (HPC), but -Pacemaker is not intended for those purposes. -] -It is maintained by the https://www.ClusterLabs.org/[ClusterLabs] community. 
- -Pacemaker's key features include: - - * Detection of and recovery from node- and service-level failures - * Ability to ensure data integrity by fencing faulty nodes - * Support for one or more nodes per cluster - * Support for multiple resource interface standards (anything that can be - scripted can be clustered) - * Support (but no requirement) for shared storage - * Support for practically any redundancy configuration (active/passive, N+1, - etc.) - * Automatically replicated configuration that can be updated from any node - * Ability to specify cluster-wide relationships between services, - such as ordering, colocation and anti-colocation - * Support for advanced service types, such as 'clones' (services that need to - be active on multiple nodes), 'stateful resources' (clones that can run in - one of two modes), and containerized services - * Unified, scriptable cluster management tools - -.Fencing -[NOTE] -==== -'Fencing', also known as 'STONITH' (an acronym for Shoot The Other Node In The -Head), is the ability to ensure that it is not possible for a node to be -running a service. This is accomplished via 'fence devices' such as -intelligent power switches that cut power to the target, or intelligent -network switches that cut the target's access to the local network. - -Pacemaker represents fence devices as a special class of resource. - -A cluster cannot safely recover from certain failure conditions, such as an -unresponsive node, without fencing. -==== - -== Cluster Architecture == - -At a high level, a cluster can be viewed as having these parts (which together -are often referred to as the 'cluster stack'): - - * *Resources:* These are the reason for the cluster's being -- the services - that need to be kept highly available. - - * *Resource agents:* These are scripts or operating system components that - start, stop, and monitor resources, given a set of resource parameters. 
- These provide a uniform interface between Pacemaker and the managed - services. - - * *Fence agents:* These are scripts that execute node fencing actions, - given a target and fence device parameters. - - * *Cluster membership layer:* This component provides reliable - messaging, membership, and quorum information about the cluster. - Currently, Pacemaker supports http://www.corosync.org/[Corosync] - as this layer. - - * *Cluster resource manager:* Pacemaker provides the brain that processes - and reacts to events that occur in the cluster. These events may include - nodes joining or leaving the cluster; resource events caused by failures, - maintenance, or scheduled activities; and other administrative actions. - To achieve the desired availability, Pacemaker may start and stop resources - and fence nodes. - - * *Cluster tools:* These provide an interface for users to interact with the - cluster. Various command-line and graphical (GUI) interfaces are available. - -Most managed services are not, themselves, cluster-aware. However, many popular -open-source cluster filesystems make use of a common 'Distributed Lock -Manager' (DLM), which makes direct use of Corosync for its messaging and -membership capabilities and Pacemaker for the ability to fence nodes. - -.Example Cluster Stack -image::images/pcmk-stack.png["Example cluster stack",width="10cm",height="7.5cm",align="center"] - -== Pacemaker Architecture == - -Pacemaker itself is composed of multiple daemons that work together: - - * pacemakerd - * pacemaker-attrd - * pacemaker-based - * pacemaker-controld - * pacemaker-execd - * pacemaker-fenced - * pacemaker-schedulerd - -.Internal Components -image::images/pcmk-internals.png["Pacemaker software components",align="center",scaledwidth="65%"] - -The Pacemaker master process (pacemakerd) spawns all the other daemons, and -respawns them if they unexpectedly exit. 
- -The 'Cluster Information Base' (CIB) is an -https://en.wikipedia.org/wiki/XML[XML] representation of the cluster's -configuration and the state of all nodes and resources. The 'CIB manager' -(pacemaker-based) keeps the CIB synchronized across the cluster, and handles -requests to modify it. - -The attribute manager (pacemaker-attrd) maintains a database of attributes for -all nodes, keeps it synchronized across the cluster, and handles requests to -modify them. These attributes are usually recorded in the CIB. - -Given a snapshot of the CIB as input, the 'scheduler' (pacemaker-schedulerd) -determines what actions are necessary to achieve the desired state of the -cluster. - -The 'local executor' (pacemaker-execd) handles requests to execute -resource agents on the local cluster node, and returns the result. - -The 'fencer' (pacemaker-fenced) handles requests to fence nodes. Given a target -node, the fencer decides which cluster node(s) should execute which fencing -device(s), and calls the necessary fencing agents (either directly, or via -requests to the fencer peers on other nodes), and returns the result. - -The 'controller' (pacemaker-controld) is Pacemaker's coordinator, -maintaining a consistent view of the cluster membership and orchestrating all -the other components. - -Pacemaker centralizes cluster decision-making by electing one of the controller -instances as the 'Designated Controller' ('DC'). Should the elected DC -process (or the node it is on) fail, a new one is quickly established. -The DC responds to cluster events by taking a current snapshot of the CIB, -feeding it to the scheduler, then asking the executors (either directly on -the local node, or via requests to controller peers on other nodes) and -the fencer to execute any necessary actions. - -.Old daemon names -[NOTE] -==== -The Pacemaker daemons were renamed in version 2.0. You may still find -references to the old names, especially in documentation targeted to version -1.1. 
- -[width="95%",cols="1,2",options="header",align="center"] -|========================================================= -| Old name | New name -| attrd | pacemaker-attrd -| cib | pacemaker-based -| crmd | pacemaker-controld -| lrmd | pacemaker-execd -| stonithd | pacemaker-fenced -| pacemaker_remoted | pacemaker-remoted -|========================================================= - -==== - -== Node Redundancy Designs == - -Pacemaker supports practically any -https://en.wikipedia.org/wiki/High-availability_cluster#Node_configurations[node -redundancy configuration] including 'Active/Active', 'Active/Passive', 'N+1', -'N+M', 'N-to-1' and 'N-to-N'. - -Active/passive clusters with two (or more) nodes using Pacemaker and -https://en.wikipedia.org/wiki/Distributed_Replicated_Block_Device:[DRBD] are -a cost-effective high-availability solution for many situations. One of the -nodes provides the desired services, and if it fails, the other node takes -over. - -.Active/Passive Redundancy -image::images/pcmk-active-passive.png["Active/Passive Redundancy",width="10cm",height="7.5cm",align="center"] - -Pacemaker also supports multiple nodes in a shared-failover design, -reducing hardware costs by allowing several active/passive clusters to be -combined and share a common backup node. - -.Shared Failover -image::images/pcmk-shared-failover.png["Shared Failover",width="10cm",height="7.5cm",align="center"] - -When shared storage is available, every node can potentially be used for -failover. Pacemaker can even run multiple copies of services to spread out the -workload. 
- -.N to N Redundancy -image::images/pcmk-active-active.png["N to N Redundancy",width="10cm",height="7.5cm",align="center"] diff --git a/doc/sphinx/Makefile.am b/doc/sphinx/Makefile.am index e6845f3ad9a..da938132a23 100644 --- a/doc/sphinx/Makefile.am +++ b/doc/sphinx/Makefile.am @@ -43,32 +43,31 @@ SPHINXFLAGS ?= # End of useful overrides -# Example scheduler transition graphs -# @TODO The original CIB XML for these is long lost. Ideally, we would recreate -# something similar and keep those here instead of the DOTs (or use a couple of -# scheduler regression test inputs instead), then regenerate the SVG -# equivalents using crm_simulate and dot when making a release. -DOTS = $(wildcard shared/images/*.dot) - -# Vector sources for generated PNGs (including SVG equivalents of DOTS, created -# manually using dot) -SVGS = $(wildcard shared/images/pcmk-*.svg) \ - $(DOTS:%.dot=%.svg) - -# PNG images generated from SVGS +# All images needed by the documentation are PNGs (added to the source +# repository and distributions). Some of these PNGs can be regenerated +# manually from DOT and/or SVG sources (also in the source repository, but +# not distributed). # -# These will not be accessible in a VPATH build, which will generate warnings -# when building the documentation, but the make will still succeed. It is -# nontrivial to get them working for VPATH builds and not worth the effort. -PNGS_GENERATED = $(SVGS:%.svg=%.png) - -# Original PNG image sources -PNGS_Clusters_from_Scratch = $(wildcard Clusters_from_Scratch/images/*.png) -PNGS_Pacemaker_Explained = $(wildcard Pacemaker_Explained/images/*.png) +# To regenerate an SVG from a DOT, you can use dot: +# +# dot $NAME.dot -Tsvg > $NAME.svg +# +# To regenerate a PNG from an SVG, you can use Inkscape (>= 1.0): +# +# inkscape --export-dpi=90 -C --export-filename=$NAME.png $NAME.svg +# +# @TODO The original CIB XML for the example scheduler transitions +# (Policy-Engine-*) is long lost. 
Ideally, we would recreate something similar +# and keep that XML here (or use a couple of scheduler regression test inputs +# instead). Then the DOTs could be regenerated as well, using crm_simulate. +PNGS_shared = $(wildcard shared/images/*.png) +PNGS_Clusters_from_Scratch = $(wildcard Clusters_from_Scratch/images/*.png) +PNGS_Pacemaker_Explained = $(wildcard Pacemaker_Explained/images/*.png) STATIC_FILES = $(wildcard _static/*.css) -EXTRA_DIST = $(wildcard */*.rst) $(DOTS) $(SVGS) \ +EXTRA_DIST = $(wildcard */*.rst) \ + $(PNGS_shared) \ $(PNGS_Clusters_from_Scratch) \ $(PNGS_Pacemaker_Explained) \ $(wildcard Pacemaker_Python_API/_templates/*rst) \ @@ -81,8 +80,8 @@ BOOK_RSYNC_DEST = $(RSYNC_PACKAGE_DEST)/doc/$(PACKAGE_SERIES) BOOK = none -DEPS_intro = shared/pacemaker-intro.rst \ - $(PNGS_GENERATED) +DEPS_intro = shared/pacemaker-intro.rst \ + $(PNGS_shared) DEPS_Clusters_from_Scratch = $(DEPS_intro) \ $(PNGS_Clusters_from_Scratch) @@ -94,18 +93,6 @@ DEPS_Pacemaker_Python_API = ../../python if BUILD_SPHINX_DOCS -INKSCAPE_CMD = $(INKSCAPE) --export-dpi=90 -C - -# Pattern rule to generate PNGs from SVGs -# (--export-png works with Inkscape <1.0, --export-filename with >=1.0; -# create the destination directory in case this is a VPATH build) -%.png: %.svg - $(AM_V_at)-$(MKDIR_P) "$(shell dirname "$@")" - $(AM_V_GEN) { \ - $(INKSCAPE_CMD) --export-png="$@" "$<" 2>/dev/null \ - || $(INKSCAPE_CMD) --export-filename="$@" "$<"; \ - } $(PCMK_quiet) - # Create a book's Sphinx configuration. # Create the book directory in case this is a VPATH build. 
$(BOOKS:%=%/conf.py): conf.py.in @@ -221,5 +208,4 @@ clean-local: $(AM_V_at)-rm -rf \ $(BOOKS:%="$(builddir)/%/_build") \ $(BOOKS:%="$(builddir)/%/conf.py") \ - $(BOOKS:%="$(builddir)/%/generated") \ - $(PNGS_GENERATED) + $(BOOKS:%="$(builddir)/%/generated") diff --git a/doc/sphinx/Pacemaker_Administration/alerts.rst b/doc/sphinx/Pacemaker_Administration/alerts.rst index ea2b0f9ed28..05424dca0b8 100644 --- a/doc/sphinx/Pacemaker_Administration/alerts.rst +++ b/doc/sphinx/Pacemaker_Administration/alerts.rst @@ -110,164 +110,206 @@ Writing an Alert Agent single: alert; environment variables single: environment variable; alert agents -.. table:: **Environment variables passed to alert agents** +.. list-table:: **Environment variables passed to alert agents** :class: longtable - :widths: 1 3 - - +---------------------------+----------------------------------------------------------------+ - | Environment Variable | Description | - +===========================+================================================================+ - | CRM_alert_kind | .. index:: | - | | single:environment variable; CRM_alert_kind | - | | single:CRM_alert_kind | - | | | - | | The type of alert (``node``, ``fencing``, ``resource``, or | - | | ``attribute``) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_node | .. index:: | - | | single:environment variable; CRM_alert_node | - | | single:CRM_alert_node | - | | | - | | Name of affected node | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_node_sequence | .. index:: | - | | single:environment variable; CRM_alert_sequence | - | | single:CRM_alert_sequence | - | | | - | | A sequence number increased whenever an alert is being issued | - | | on the local node, which can be used to reference the order in | - | | which alerts have been issued by Pacemaker. 
An alert for an | - | | event that happened later in time reliably has a higher | - | | sequence number than alerts for earlier events. | - | | | - | | Be aware that this number has no cluster-wide meaning. | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_recipient | .. index:: | - | | single:environment variable; CRM_alert_recipient | - | | single:CRM_alert_recipient | - | | | - | | The configured recipient | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_timestamp | .. index:: | - | | single:environment variable; CRM_alert_timestamp | - | | single:CRM_alert_timestamp | - | | | - | | A timestamp created prior to executing the agent, in the | - | | format specified by the ``timestamp-format`` meta-attribute. | - | | This allows the agent to have a reliable, high-precision time | - | | of when the event occurred, regardless of when the agent | - | | itself was invoked (which could potentially be delayed due to | - | | system load, etc.). | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_timestamp_epoch | .. index:: | - | | single:environment variable; CRM_alert_timestamp_epoch | - | | single:CRM_alert_timestamp_epoch | - | | | - | | The same time as ``CRM_alert_timestamp``, expressed as the | - | | integer number of seconds since January 1, 1970. This (along | - | | with ``CRM_alert_timestamp_usec``) can be useful for alert | - | | agents that need to format time in a specific way rather than | - | | let the user configure it. | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_timestamp_usec | .. 
index:: | - | | single:environment variable; CRM_alert_timestamp_usec | - | | single:CRM_alert_timestamp_usec | - | | | - | | The same time as ``CRM_alert_timestamp``, expressed as the | - | | integer number of microseconds since | - | | ``CRM_alert_timestamp_epoch``. | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_version | .. index:: | - | | single:environment variable; CRM_alert_version | - | | single:CRM_alert_version | - | | | - | | The version of Pacemaker sending the alert | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_desc | .. index:: | - | | single:environment variable; CRM_alert_desc | - | | single:CRM_alert_desc | - | | | - | | Detail about event. For ``node`` alerts, this is the node's | - | | current state (``member`` or ``lost``). For ``fencing`` | - | | alerts, this is a summary of the requested fencing operation, | - | | including origin, target, and fencing operation error code, if | - | | any. For ``resource`` alerts, this is a readable string | - | | equivalent of ``CRM_alert_status``. | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_nodeid | .. index:: | - | | single:environment variable; CRM_alert_nodeid | - | | single:CRM_alert_nodeid | - | | | - | | ID of node whose status changed (provided with ``node`` alerts | - | | only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_rc | .. index:: | - | | single:environment variable; CRM_alert_rc | - | | single:CRM_alert_rc | - | | | - | | The numerical return code of the fencing or resource operation | - | | (provided with ``fencing`` and ``resource`` alerts only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_task | .. 
index:: | - | | single:environment variable; CRM_alert_task | - | | single:CRM_alert_task | - | | | - | | The requested fencing or resource operation (provided with | - | | ``fencing`` and ``resource`` alerts only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_exec_time | .. index:: | - | | single:environment variable; CRM_alert_exec_time | - | | single:CRM_alert_exec_time | - | | | - | | The (wall-clock) time, in milliseconds, that it took to | - | | execute the action. If the action timed out, | - | | ``CRM_alert_status`` will be 2, ``CRM_alert_desc`` will be | - | | "Timed Out", and this value will be the action timeout. May | - | | not be supported on all platforms. (``resource`` alerts only) | - | | *(since 2.0.1)* | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_interval | .. index:: | - | | single:environment variable; CRM_alert_interval | - | | single:CRM_alert_interval | - | | | - | | The interval of the resource operation (``resource`` alerts | - | | only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_rsc | .. index:: | - | | single:environment variable; CRM_alert_rsc | - | | single:CRM_alert_rsc | - | | | - | | The name of the affected resource (``resource`` alerts only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_status | .. index:: | - | | single:environment variable; CRM_alert_status | - | | single:CRM_alert_status | - | | | - | | A numerical code used by Pacemaker to represent the operation | - | | result (``resource`` alerts only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_target_rc | .. 
index:: | - | | single:environment variable; CRM_alert_target_rc | - | | single:CRM_alert_target_rc | - | | | - | | The expected numerical return code of the operation | - | | (``resource`` alerts only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_attribute_name | .. index:: | - | | single:environment variable; CRM_alert_attribute_name | - | | single:CRM_alert_attribute_name | - | | | - | | The name of the node attribute that changed (``attribute`` | - | | alerts only) | - +---------------------------+----------------------------------------------------------------+ - | CRM_alert_attribute_value | .. index:: | - | | single:environment variable; CRM_alert_attribute_value | - | | single:CRM_alert_attribute_value | - | | | - | | The new value of the node attribute that changed | - | | (``attribute`` alerts only) | - +---------------------------+----------------------------------------------------------------+ + :widths: 1 3 1 + :header-rows: 1 + * - Environment Variable + - Description + - Alert Types + * - .. _CRM_alert_kind: + + .. index:: + single: environment variable; CRM_alert_kind + single: CRM_alert_kind + + CRM_alert_kind + - The type of alert (``node``, ``fencing``, ``resource``, or + ``attribute``) + - all + * - .. _CRM_alert_node: + + .. index:: + single: environment variable; CRM_alert_node + single: CRM_alert_node + + CRM_alert_node + - Name of affected node + - all + * - .. _CRM_alert_node_sequence: + + .. index:: + single: environment variable; CRM_alert_node_sequence + single: CRM_alert_node_sequence + + CRM_alert_node_sequence + - A sequence number increased whenever an alert is being issued on the + local node, which can be used to reference the order in which alerts + have been issued by Pacemaker. An alert for an event that happened later + in time reliably has a higher sequence number than alerts for earlier + events. This number has no cluster-wide meaning. + - all + * - .. 
_CRM_alert_recipient: + + .. index:: + single: environment variable; CRM_alert_recipient + single: CRM_alert_recipient + + CRM_alert_recipient + - The configured recipient + - all + * - .. _CRM_alert_timestamp: + + .. index:: + single: environment variable; CRM_alert_timestamp + single: CRM_alert_timestamp + + CRM_alert_timestamp + - A timestamp created prior to executing the agent, in the format + specified by the ``timestamp-format`` meta-attribute. This allows the + agent to have a reliable, high-precision time of when the event + occurred, regardless of when the agent itself was invoked (which could + potentially be delayed due to system load, etc.). + - all + * - .. _CRM_alert_timestamp_epoch: + + .. index:: + single: environment variable; CRM_alert_timestamp_epoch + single: CRM_alert_timestamp_epoch + + CRM_alert_timestamp_epoch + - The same time as ``CRM_alert_timestamp``, expressed as the integer + number of seconds since January 1, 1970. This (along with + ``CRM_alert_timestamp_usec``) can be useful for alert agents that need + to format time in a specific way rather than let the user configure it. + - all + * - .. _CRM_alert_timestamp_usec: + + .. index:: + single: environment variable; CRM_alert_timestamp_usec + single: CRM_alert_timestamp_usec + + CRM_alert_timestamp_usec + - The same time as ``CRM_alert_timestamp``, expressed as the integer + number of microseconds since ``CRM_alert_timestamp_epoch``. + - all + * - .. _CRM_alert_version: + + .. index:: + single: environment variable; CRM_alert_version + single: CRM_alert_version + + CRM_alert_version + - The version of Pacemaker sending the alert + - all + * - .. _CRM_alert_desc: + + .. index:: + single: environment variable; CRM_alert_desc + single: CRM_alert_desc + + CRM_alert_desc + - Detail about event. For ``node`` alerts, this is the node's current + state (``member`` or ``lost``). 
For ``fencing`` alerts, this is a + summary of the requested fencing operation, including origin, target, + and fencing operation error code, if any. For ``resource`` alerts, this + is a readable string equivalent of ``CRM_alert_status``. + - ``node``, ``fencing``, ``resource`` + * - .. _CRM_alert_nodeid: + + .. index:: + single: environment variable; CRM_alert_nodeid + single: CRM_alert_nodeid + + CRM_alert_nodeid + - ID of node whose status changed + - ``node`` + * - .. _CRM_alert_rc: + + .. index:: + single: environment variable; CRM_alert_rc + single: CRM_alert_rc + + CRM_alert_rc + - The numerical return code of the fencing or resource operation + - ``fencing``, ``resource`` + * - .. _CRM_alert_task: + + .. index:: + single: environment variable; CRM_alert_task + single: CRM_alert_task + + CRM_alert_task + - The requested fencing or resource operation + - ``fencing``, ``resource`` + * - .. _CRM_alert_exec_time: + + .. index:: + single: environment variable; CRM_alert_exec_time + single: CRM_alert_exec_time + + CRM_alert_exec_time + - The (wall-clock) time, in milliseconds, that it took to execute the + action. If the action timed out, ``CRM_alert_status`` will be 2, + ``CRM_alert_desc`` will be "Timed Out", and this value will be the + action timeout. May not be supported on all platforms. *(since 2.0.1)* + - ``resource`` + * - .. _CRM_alert_interval: + + .. index:: + single: environment variable; CRM_alert_interval + single: CRM_alert_interval + + CRM_alert_interval + - The interval of the resource operation + - ``resource`` + * - .. _CRM_alert_rsc: + + .. index:: + single: environment variable; CRM_alert_rsc + single: CRM_alert_rsc + + CRM_alert_rsc + - The name of the affected resource + - ``resource`` + * - .. _CRM_alert_status: + + .. index:: + single: environment variable; CRM_alert_status + single: CRM_alert_status + + CRM_alert_status + - A numerical code used by Pacemaker to represent the operation result + - ``resource`` + * - .. 
_CRM_alert_target_rc: + + .. index:: + single: environment variable; CRM_alert_target_rc + single: CRM_alert_target_rc + + CRM_alert_target_rc + - The expected numerical return code of the operation + - ``resource`` + * - .. _CRM_alert_attribute_name: + + .. index:: + single: environment variable; CRM_alert_attribute_name + single: CRM_alert_attribute_name + + CRM_alert_attribute_name + - The name of the node attribute that changed + - ``attribute`` + * - .. _CRM_alert_attribute_value: + + .. index:: + single: environment variable; CRM_alert_attribute_value + single: CRM_alert_attribute_value + + CRM_alert_attribute_value + - The new value of the node attribute that changed + - ``attribute`` + Special concerns when writing alert agents: * Alert agents may be called with no recipient (if none is configured), diff --git a/doc/sphinx/Pacemaker_Administration/troubleshooting.rst b/doc/sphinx/Pacemaker_Administration/troubleshooting.rst index 22c9dc861c6..4f24725979f 100644 --- a/doc/sphinx/Pacemaker_Administration/troubleshooting.rst +++ b/doc/sphinx/Pacemaker_Administration/troubleshooting.rst @@ -91,6 +91,11 @@ actions were scheduled. The ``crm_simulate`` command, described in The log messages immediately before the "saving inputs" message will include any actions that the scheduler thinks need to be done. +.. important:: + + Any actions that have already been initiated must complete (or time out) + before a new transition can be calculated. + Node Failures ############# diff --git a/doc/sphinx/Pacemaker_Development/c.rst b/doc/sphinx/Pacemaker_Development/c.rst index 3f3c9746fb2..443f9862df1 100644 --- a/doc/sphinx/Pacemaker_Development/c.rst +++ b/doc/sphinx/Pacemaker_Development/c.rst @@ -782,6 +782,18 @@ initialization procedure on the new object. * If the constructor may add the new object to some existing object, its name should include ``create``. 
+Functions that take the caller's name as an argument +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Sometimes, we define a function that uses the ``__FILE__``, ``__func__``, +and/or ``__LINE__`` of the caller for logging purposes, often with a wrapper +macro that automatically passes them. + +* The function should take those values as its first arguments. +* The function name should end in ``_as()``. +* If a wrapper macro is used, its name should be the same without ``_as()``. +* See ``pcmk__assert_alloc()`` and ``pcmk__assert_alloc_as()`` as examples. + Function Definitions ____________________ @@ -1095,6 +1107,14 @@ Pacemaker may crash with a segmentation fault, or change tracking and ACL checking may be incorrectly disabled. +XPaths +______ + +Separating XPath element names with ``/`` (specifying each level in the +hierarchy explicitly) is more efficient than ``//`` (allowing intermediate +levels to be omitted), so it should be used whenever practical. + + .. index:: single: Makefile.am diff --git a/doc/sphinx/Pacemaker_Explained/acls.rst b/doc/sphinx/Pacemaker_Explained/acls.rst index c3de39d0ded..878f8f64b37 100644 --- a/doc/sphinx/Pacemaker_Explained/acls.rst +++ b/doc/sphinx/Pacemaker_Explained/acls.rst @@ -36,6 +36,12 @@ In order to use ACLs: supports ACLs only if the output of the command ``pacemakerd --features`` contains ``acls``. In newer versions, ACLs are always enabled. +.. important:: + + ``enable-acl`` should be set either by the root user, or as part of a batch + of CIB changes including roles and users. Otherwise, the user setting it + might lock themselves out from making any further changes. + .. index:: single: Access Control List (ACL); acls @@ -78,7 +84,7 @@ element in the CIB ``acls`` section. 
| | single: description; acl_role attribute | | | single: attribute; description (acl_role) | | | | - | | Arbitrary text (not used by Pacemaker) | + | | Arbitrary text for user's use (ignored by Pacemaker) | +------------------+-----------------------------------------------------------+ An ``acl_role`` element may contain any number of ``acl_permission`` elements. @@ -105,7 +111,7 @@ An ``acl_role`` element may contain any number of ``acl_permission`` elements. | | single: description; acl_permission attribute | | | single: attribute; description (acl_permission) | | | | - | | Arbitrary text (not used by Pacemaker) | + | | Arbitrary text for user's use (ignored by Pacemaker) | +------------------+-----------------------------------------------------------+ | kind | .. index:: | | | single: acl_permission; kind (attribute) | @@ -280,6 +286,16 @@ elements. is true, permission to all parts of the CIB is denied by default (permissions must be explicitly granted). + +ACLs and Pacemaker Remote Nodes +############################### + +ACLs apply differently on Pacemaker Remote nodes, which are assumed to be +special-purpose hosts without typical user accounts. Instead, CIB modifications +coming from a Pacemaker Remote node use the node's name as the ACL user name, +and ``pacemaker-remote`` as the role. + + ACL Examples ############ diff --git a/doc/sphinx/Pacemaker_Explained/alerts.rst b/doc/sphinx/Pacemaker_Explained/alerts.rst index f4cad72cb76..27000ed9410 100644 --- a/doc/sphinx/Pacemaker_Explained/alerts.rst +++ b/doc/sphinx/Pacemaker_Explained/alerts.rst @@ -90,13 +90,20 @@ Alert Meta-Attributes As with resources, meta-attributes can be configured for alerts to change whether and how Pacemaker calls them. -.. table:: **Meta-Attributes of an Alert** +.. 
table:: **Meta-Attributes of an Alert or Recipient** :class: longtable :widths: 1 1 3 +------------------+---------------+-----------------------------------------------------+ | Meta-Attribute | Default | Description | +==================+===============+=====================================================+ + | description | | .. index:: | + | | | single: alert; meta-attribute, description | + | | | single: meta-attribute; description (alert) | + | | | single: description; alert meta-attribute | + | | | | + | | | Arbitrary text for user's use (ignored by Pacemaker)| + +------------------+---------------+-----------------------------------------------------+ | enabled | true | .. index:: | | | | single: alert; meta-attribute, enabled | | | | single: meta-attribute; enabled (alert) | diff --git a/doc/sphinx/Pacemaker_Explained/ap-samples.rst b/doc/sphinx/Pacemaker_Explained/ap-samples.rst index 641affc96ed..35188a87506 100644 --- a/doc/sphinx/Pacemaker_Explained/ap-samples.rst +++ b/doc/sphinx/Pacemaker_Explained/ap-samples.rst @@ -101,12 +101,12 @@ Advanced Configuration - + - + diff --git a/doc/sphinx/Pacemaker_Explained/cluster-options.rst b/doc/sphinx/Pacemaker_Explained/cluster-options.rst index e4b579bbd05..6ebe5f38ebd 100644 --- a/doc/sphinx/Pacemaker_Explained/cluster-options.rst +++ b/doc/sphinx/Pacemaker_Explained/cluster-options.rst @@ -518,6 +518,10 @@ values, by running the ``man pacemaker-schedulerd`` and further protection (which can mean *data loss* if the unresponsive node still accesses shared storage, for example). See also the :ref:`requires ` resource meta-attribute. + + This option applies only to fencing scheduled by the cluster, not to + requests initiated externally (such as with the ``stonith_admin`` + command-line tool). * - .. _stonith_action: .. 
index:: @@ -813,10 +817,19 @@ values, by running the ``man pacemaker-schedulerd`` and - Pacemaker is primarily event-driven, and looks ahead to know when to recheck the cluster for failure-timeout settings and most time-based rules *(since 2.0.3)*. However, it will also recheck the cluster after - this amount of inactivity. This has two goals: rules with ``date_spec`` - are only guaranteed to be checked this often, and it also serves as a - fail-safe for some kinds of scheduler bugs. A value of 0 disables this - polling. + this amount of inactivity. This has three main effects: + + * :ref:`Rules ` using ``date_spec`` are guaranteed to be checked + only this often. + * If :ref:`fencing ` fails enough to reach + :ref:`stonith-max-attempts `, attempts will + begin again after at most this time. + * It serves as a fail-safe in case of certain scheduler bugs. If the + scheduler incorrectly determines only some of the actions needed to + react to a particular event, it will often correctly determine the + rest after at most this time. + + A value of 0 disables this polling. * - .. _shutdown_lock: .. index:: diff --git a/doc/sphinx/Pacemaker_Explained/collective.rst b/doc/sphinx/Pacemaker_Explained/collective.rst index 36655575749..93b0447a062 100644 --- a/doc/sphinx/Pacemaker_Explained/collective.rst +++ b/doc/sphinx/Pacemaker_Explained/collective.rst @@ -33,7 +33,7 @@ of groups. - + Although the example above contains only two resources, there is no @@ -65,7 +65,7 @@ The group above is logically equivalent to writing: - + @@ -103,9 +103,7 @@ ________________ | | single: attribute; description (group) | | | single: description; group attribute | | | | - | | An optional description of the group, for the user's own | - | | purposes. | - | | E.g. 
``resources needed for website`` | + | | Arbitrary text for user's use (ignored by Pacemaker) | +-------------+------------------------------------------------------------------+ Group Options @@ -231,9 +229,7 @@ ________________ | | single: attribute; description (clone) | | | single: description; clone attribute | | | | - | | An optional description of the clone, for the user's own | - | | purposes. | - | | E.g. ``IP address for website`` | + | | Arbitrary text for user's use (ignored by Pacemaker) | +-------------+------------------------------------------------------------------+ .. index:: @@ -378,7 +374,7 @@ Clones must contain exactly one primitive or group resource. .. code-block:: xml - + @@ -715,9 +711,7 @@ _________________ | | single: attribute; description (bundle) | | | single: description; bundle attribute | | | | - | | An optional description of the group, for the user's own | - | | purposes. | - | | E.g. ``manages the container that runs the service`` | + | | Arbitrary text for user's use (ignored by Pacemaker) | +-------------+------------------------------------------------------------------+ diff --git a/doc/sphinx/Pacemaker_Explained/constraints.rst b/doc/sphinx/Pacemaker_Explained/constraints.rst index 2202bbe0fcd..7d2f39c58e8 100644 --- a/doc/sphinx/Pacemaker_Explained/constraints.rst +++ b/doc/sphinx/Pacemaker_Explained/constraints.rst @@ -775,9 +775,37 @@ have an effect in all contexts. | | | single: score; resource_set attribute | | | | | | | | *Advanced use only.* Use a specific score for this | - | | | set within the constraint. | + | | | set. Meaningful within ``rsc_location`` or | + | | | ``rsc_colocation``. | + +-------------+------------------+--------------------------------------------------------+ + | kind | | .. 
index:: | + | | | single: resource_set; attribute, kind | + | | | single: attribute; kind (resource_set) | + | | | single: kind; resource_set attribute | + | | | | + | | | *Advanced use only.* Use a specific kind for this | + | | | set. Meaningful within ``rsc_order``. | +-------------+------------------+--------------------------------------------------------+ +Anti-colocation Chains +______________________ + +Sometimes, you would like a set of resources to be anti-colocated with each +other. For example, ``resource1``, ``resource2``, and ``resource3`` must all +run on different nodes. + +A straightforward approach would be to configure either separate colocations or +a resource set, with ``-INFINITY`` scores between all the resources. + +However, this will not work as expected. + +Resource sets may in the future gain new syntax for this specific situation, +but for now, a workaround is to use :ref:`utilization ` instead of +colocations to keep the resources apart. Create a utilization attribute for the +anti-colocation, assign the same value to each resource, and give each node the +capacity to run one resource. + + .. _s-resource-sets-ordering: Ordering Sets of Resources diff --git a/doc/sphinx/Pacemaker_Explained/index.rst b/doc/sphinx/Pacemaker_Explained/index.rst index 63387f3421e..68139809c04 100644 --- a/doc/sphinx/Pacemaker_Explained/index.rst +++ b/doc/sphinx/Pacemaker_Explained/index.rst @@ -25,12 +25,12 @@ Table of Contents operations constraints fencing - alerts - rules collective - reusing-configuration utilization + rules acls + alerts + reusing-configuration status multi-site-clusters ap-samples diff --git a/doc/sphinx/Pacemaker_Explained/nodes.rst b/doc/sphinx/Pacemaker_Explained/nodes.rst index 89acbf4e841..e88e63a10e0 100644 --- a/doc/sphinx/Pacemaker_Explained/nodes.rst +++ b/doc/sphinx/Pacemaker_Explained/nodes.rst @@ -20,6 +20,30 @@ toward cluster quorum, and serve as the cluster's Designated Controller (DC). 
Every cluster must have at least one cluster node. Scalability is limited by the cluster layer to around 32 cluster nodes. +Host Clock Considerations +######################### + +In general, Pacemaker does not rely on time or time zones being synchronized +across nodes. However, if the configuration uses date/time-based :ref:`rules +`, synchronization is a good idea; otherwise the rules will evaluate +differently depending on which node is the Designated Controller (DC). Also, +synchronization is greatly helpful when comparing logs across multiple nodes +for problem investigation. + +If a node's clock jumps forward, you may see relatively minor issues such as +various timeouts suddenly being considered expired. + +If a node's clock jumps backward, more serious problems may occur, so this +should be avoided. If the host clock is adjusted at boot, and Pacemaker is +enabled at boot, Pacemaker's start should be ordered after the clock +adjustment. When run under systemd, Pacemaker will automatically order itself +after ``time-sync.target``. However, depending on the local setup, you may need +to enable an additional service (for example, ``chronyd-wait.service``) for +that to be effective, or write your own workaround (for example, see the +discussion on +`systemd issue#5097 `_). + + .. _pacemaker_remote: .. index:: @@ -108,6 +132,8 @@ be the same as its local hostname. Pacemaker uses the following for a cluster node's name, in order of most preferred first: * The value of ``name`` in the ``nodelist`` section of ``corosync.conf`` + (``nodeid`` must also be explicitly set there in order for Pacemaker to + associate the name with the node) * The value of ``ring0_addr`` in the ``nodelist`` section of ``corosync.conf`` * The local hostname (value of ``uname -n``) @@ -125,6 +151,29 @@ node ID will display the name used by the node with the given Corosync crm_node --name-for-id 2 +.. 
index:: + single: node; quorum-only + single: quorum-only node + +Quorum-only Nodes +_________________ + +One popular cluster design uses an even number of cluster nodes (often 2), with +an additional lightweight host that contributes to providing quorum but cannot +run resources. + +With Pacemaker, this can be achieved in either of two ways: + +* When Corosync is used as the underlying cluster layer, the lightweight host + can run `qdevice `_ instead of + Corosync and Pacemaker. + +* The lightweight host can be configured as a Pacemaker cluster node, and a + :ref:`location constraint ` can be configured for the + node with ``score`` set to ``-INFINITY``, ``rsc-pattern`` set to ``.*``, and + ``resource-discovery`` set to ``never``. + + .. index:: single: node; attribute single: node attribute @@ -400,7 +449,8 @@ following values: | | single: yellow; node health attribute value | | | single: node attribute; health (yellow) | | | | - | | This indicator is becoming unhealthy | + | | This indicator is close to unhealthy (whether worsening or | + | | recovering) | +------------+--------------------------------------------------------------+ | ``green`` | .. index:: | | | single: green; node health attribute value | @@ -416,6 +466,16 @@ following values: | | positive is healthy, negative is unhealthy) | +------------+--------------------------------------------------------------+ +.. note:: + + A health attribute may technically be transient or permanent, but generally + only transient makes sense. + +.. note:: + + ``red``, ``yellow``, and ``green`` function as aliases for particular + numeric scores as described later. + .. 
index:: pair: cluster option; node-health-strategy diff --git a/doc/sphinx/Pacemaker_Explained/operations.rst b/doc/sphinx/Pacemaker_Explained/operations.rst index b8a324b8ab9..b5488268e82 100644 --- a/doc/sphinx/Pacemaker_Explained/operations.rst +++ b/doc/sphinx/Pacemaker_Explained/operations.rst @@ -98,6 +98,17 @@ If not specified, the default from the table below is used. this is 0, Pacemaker will apply other properties configured for this operation to instances that are scheduled as needed during normal cluster operation. *(required)* + * - .. _op_description: + + .. index:: + pair: op; description + single: description; action property + single: action; property, description + + description + - :ref:`text ` + - + - Arbitrary text for user's use (ignored by Pacemaker) * - .. _op_role: .. index:: @@ -173,6 +184,44 @@ If not specified, the default from the table below is used. (:ref:`is-managed ` set to ``false``), which does not stop recurring operations. Maintenance mode, which does stop configured monitors, overrides this setting. + * - .. _op_interval_origin: + + .. index:: + pair: op; interval-origin + single: interval-origin; action property + single: action; property, interval-origin + + interval-origin + - :ref:`ISO 8601 ` + - + - If set for a recurring action, the action will be scheduled for this + time plus a multiple of the action's interval, rather than immediately + after the resource gains the monitored role. For example, you might + schedule an in-depth monitor to run once per day outside business hours, + by setting this to the desired time (on any date) and setting + ``interval`` to ``24h``. At most one of ``interval-origin`` and + ``start-delay`` may be set. + * - .. _op_start_delay: + + .. 
index:: + pair: op; start-delay + single: start-delay; action property + single: action; property, start-delay + + start-delay + - :ref:`duration ` + - + - If set, the cluster will wait this long before running the action (for + the first time, if recurring). This is an advanced option that should + generally be avoided. It can be useful for a recurring monitor if a + resource agent incorrectly returns success from start before the service + is actually ready, and the agent can't be corrected, or for a start + action if a service takes a very long time to start, and you don't want + to block the cluster from responding to other events during that time. + If this delay is longer than 5 minutes, the cluster will pretend that + the action succeeded when it is first scheduled for the purpose of other + actions needed, then act on the result when it actually runs. At most + one of ``interval-origin`` and ``start-delay`` may be set. * - .. _op_record_pending: .. index:: @@ -403,42 +452,6 @@ Once you've done whatever you needed to do, you can then re-enable it with # cibadmin --modify --xml-text '' -.. index:: - single: start-delay; operation attribute - single: interval-origin; operation attribute - single: interval; interval-origin - single: operation; interval-origin - single: operation; start-delay - -Specifying When Recurring Actions are Performed -############################################### - -By default, recurring actions are scheduled relative to when the resource -started. In some cases, you might prefer that a recurring action start relative -to a specific date and time. For example, you might schedule an in-depth -monitor to run once every 24 hours, and want it to run outside business hours. - -To do this, set the operation's ``interval-origin``. The cluster uses this point -to calculate the correct ``start-delay`` such that the operation will occur -at ``interval-origin`` plus a multiple of the operation interval. 
- -For example, if the recurring operation's interval is 24h, its -``interval-origin`` is set to 02:00, and it is currently 14:32, then the -cluster would initiate the operation after 11 hours and 28 minutes. - -The value specified for ``interval`` and ``interval-origin`` can be any -date/time conforming to the -`ISO8601 standard `_. By way of -example, to specify an operation that would run on the first Monday of -2021 and every Monday after that, you would add: - -.. topic:: Example recurring action that runs relative to base date/time - - .. code-block:: xml - - - - .. index:: single: resource; failure recovery single: operation; failure recovery diff --git a/doc/sphinx/Pacemaker_Explained/resources.rst b/doc/sphinx/Pacemaker_Explained/resources.rst index 0c384b1f2bf..16d437f71a1 100644 --- a/doc/sphinx/Pacemaker_Explained/resources.rst +++ b/doc/sphinx/Pacemaker_Explained/resources.rst @@ -82,9 +82,10 @@ Most Linux distributions use `Systemd and service management. *Unit files* specify how to manage services and are usually provided by the distribution. -Pacemaker can manage systemd services. Simply create a resource with -``systemd`` as the resource standard and the unit file name as the resource -type. Do *not* run ``systemctl enable`` on the unit. +Pacemaker can manage systemd units of type service, socket, mount, timer, or +path. Simply create a resource with ``systemd`` as the resource standard and +the unit file name as the resource type. Do *not* run ``systemctl enable`` on +the unit. .. important:: @@ -189,8 +190,7 @@ where to find that resource agent and what standards it conforms to. | | single: description; resource | | | single: resource; property, description | | | | - | | A description of the Resource Agent, intended for local use. | - | | E.g. ``IP address for website`` | + | | Arbitrary text for user's use (ignored by Pacemaker) | +-------------+------------------------------------------------------------------+ | type | .. 
index:: | | | single: type; resource | diff --git a/doc/sphinx/Pacemaker_Explained/reusing-configuration.rst b/doc/sphinx/Pacemaker_Explained/reusing-configuration.rst index 39f736fcfdb..01c7a974ae4 100644 --- a/doc/sphinx/Pacemaker_Explained/reusing-configuration.rst +++ b/doc/sphinx/Pacemaker_Explained/reusing-configuration.rst @@ -393,7 +393,7 @@ For instance, given the following CIB section: - + diff --git a/doc/sphinx/shared/images/Policy-Engine-big.png b/doc/sphinx/shared/images/Policy-Engine-big.png new file mode 100644 index 00000000000..f2a36418419 Binary files /dev/null and b/doc/sphinx/shared/images/Policy-Engine-big.png differ diff --git a/doc/sphinx/shared/images/Policy-Engine-small.png b/doc/sphinx/shared/images/Policy-Engine-small.png new file mode 100644 index 00000000000..7b8e573d590 Binary files /dev/null and b/doc/sphinx/shared/images/Policy-Engine-small.png differ diff --git a/doc/sphinx/shared/images/pcmk-active-active.png b/doc/sphinx/shared/images/pcmk-active-active.png new file mode 100644 index 00000000000..9039601c41a Binary files /dev/null and b/doc/sphinx/shared/images/pcmk-active-active.png differ diff --git a/doc/sphinx/shared/images/pcmk-active-passive.png b/doc/sphinx/shared/images/pcmk-active-passive.png new file mode 100644 index 00000000000..fa001a12331 Binary files /dev/null and b/doc/sphinx/shared/images/pcmk-active-passive.png differ diff --git a/doc/sphinx/shared/images/pcmk-colocated-sets.png b/doc/sphinx/shared/images/pcmk-colocated-sets.png new file mode 100644 index 00000000000..206f4b1c079 Binary files /dev/null and b/doc/sphinx/shared/images/pcmk-colocated-sets.png differ diff --git a/doc/sphinx/shared/images/pcmk-internals.png b/doc/sphinx/shared/images/pcmk-internals.png new file mode 100644 index 00000000000..a9867fa540e Binary files /dev/null and b/doc/sphinx/shared/images/pcmk-internals.png differ diff --git a/doc/sphinx/shared/images/pcmk-overview.svg b/doc/sphinx/shared/images/pcmk-overview.svg deleted file mode 
100644 index 9fb022db44c..00000000000 --- a/doc/sphinx/shared/images/pcmk-overview.svg +++ /dev/null @@ -1,855 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - Pacemaker 10,000ft - - - - - - - - Cluster Resource Manager - - - - - - Local Resource Manager - - - - - Resource Agents - - Messaging & Membership - - - - - - diff --git a/doc/sphinx/shared/images/pcmk-shared-failover.png b/doc/sphinx/shared/images/pcmk-shared-failover.png new file mode 100644 index 00000000000..9c3601a7c41 Binary files /dev/null and b/doc/sphinx/shared/images/pcmk-shared-failover.png differ diff --git a/doc/sphinx/shared/images/pcmk-stack.svg b/doc/sphinx/shared/images/pcmk-stack.svg deleted file mode 100644 index fcbe137cfb4..00000000000 --- a/doc/sphinx/shared/images/pcmk-stack.svg +++ /dev/null @@ -1,925 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - Distributed Lock Manager - - - - Pacemaker - - Pacemaker Stack - Build Dependency - - - - - - - - - - Resource Agents - - - - - Corosync - - - - - - cLVM2 - - - - GFS2 - - - - OCFS2 - - - - - - - diff --git a/doc/sphinx/shared/pacemaker-intro.rst b/doc/sphinx/shared/pacemaker-intro.rst index dbea686b744..c06b4bad3af 100644 --- a/doc/sphinx/shared/pacemaker-intro.rst +++ b/doc/sphinx/shared/pacemaker-intro.rst @@ -73,10 +73,6 @@ open-source cluster filesystems make use of a common *Distributed Lock Manager* (DLM), which makes direct use of Corosync for its messaging and membership capabilities and Pacemaker for the 
ability to fence nodes. -.. image:: ../shared/images/pcmk-stack.png - :alt: Example cluster stack - :align: center - Pacemaker Architecture ______________________ diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index 7391abba0d4..0705b6570d8 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -437,7 +437,7 @@ should_forget_cluster_node(gpointer key, gpointer value, gpointer user_data) return FALSE; } - crm_info("Removing node with name %s and cluster layer ID " PRIu32 + crm_info("Removing node with name %s and cluster layer ID %" PRIu32 " from membership cache", pcmk__s(node->name, "(unknown)"), node->cluster_layer_id); return TRUE; @@ -1282,7 +1282,7 @@ update_peer_state_iter(const char *source, pcmk__node_status_t *node, * refresh_remote_nodes(). */ if(iter) { - crm_notice("Purged 1 peer with cluster layer ID=" PRIu32 + crm_notice("Purged 1 peer with cluster layer ID %" PRIu32 "and/or name=%s from the membership cache", node->cluster_layer_id, node->name); g_hash_table_iter_remove(iter); diff --git a/lib/common/digest.c b/lib/common/digest.c index 5d02d9cd3d1..0884f5642bd 100644 --- a/lib/common/digest.c +++ b/lib/common/digest.c @@ -25,6 +25,30 @@ #define BEST_EFFORT_STATUS 0 +/* + * Pacemaker uses digests (MD5 hashes) of stringified XML to detect changes in + * the CIB as a whole, a particular resource's agent parameters, and the device + * parameters last used to unfence a particular node. + * + * "v2" digests hash pcmk__xml_string() directly, while less efficient "v1" + * digests do the same with a prefixed space, suffixed newline, and optional + * pre-sorting. + * + * On-disk CIB digests use v1 without sorting. + * + * Operation digests use v1 with sorting, and are stored in a resource's + * operation history in the CIB status section. 
They come in three flavors: + * - a digest of (nearly) all resource parameters and options, used to detect + * any resource configuration change; + * - a digest of resource parameters marked as nonreloadable, used to decide + * whether a reload or full restart is needed after a configuration change; + * - and a digest of resource parameters not marked as private, used in + * simulations where private parameters have been removed from the input. + * + * Unfencing digests are set as node attributes, and are used to require + * that nodes be unfenced again after a device's configuration changes. + */ + /*! * \internal * \brief Dump XML in a format used with v1 digests diff --git a/python/pacemaker/_cts/clustermanager.py b/python/pacemaker/_cts/clustermanager.py index 09d77617b22..f1683daf7d7 100644 --- a/python/pacemaker/_cts/clustermanager.py +++ b/python/pacemaker/_cts/clustermanager.py @@ -14,6 +14,7 @@ from collections import UserDict from pacemaker.buildoptions import BuildOptions +from pacemaker.exitstatus import ExitStatus from pacemaker._cts.CTS import NodeStatus from pacemaker._cts.audits import AuditResource from pacemaker._cts.cib import ConfigFactory @@ -820,15 +821,16 @@ def has_quorum(self, node_list): if self.expected_status[node] != "up": continue - (_, quorum) = self.rsh(node, self.templates["QuorumCmd"], verbose=1) - quorum = quorum[0].strip() + (rc, quorum) = self.rsh(node, self.templates["QuorumCmd"], verbose=1) + if rc != ExitStatus.OK: + self.debug("WARN: Quorum check on %s returned error (%d)" % (node, rc)) + continue + quorum = quorum[0].strip() if quorum.find("1") != -1: return True - if quorum.find("0") != -1: return False - self.debug("WARN: Unexpected quorum test result from %s:%s" % (node, quorum)) return False diff --git a/python/pacemaker/_cts/patterns.py b/python/pacemaker/_cts/patterns.py index fd3a5990f6f..46051b66686 100644 --- a/python/pacemaker/_cts/patterns.py +++ b/python/pacemaker/_cts/patterns.py @@ -307,7 +307,9 @@ def 
__init__(self): self._components["pacemaker-controld-ignore"] = [] self._components["pacemaker-attrd"] = [] - self._components["pacemaker-attrd-ignore"] = [] + self._components["pacemaker-attrd-ignore"] = [ + r"pacemaker-controld.*Connection to attrd (IPC failed|closed)", + ] self._components["pacemaker-schedulerd"] = [ r"State transition .* S_RECOVERY", diff --git a/python/pacemaker/buildoptions.py.in b/python/pacemaker/buildoptions.py.in index de9f666f63e..02f5552a438 100644 --- a/python/pacemaker/buildoptions.py.in +++ b/python/pacemaker/buildoptions.py.in @@ -64,6 +64,9 @@ class BuildOptions: OCF_ROOT_DIR = "@PCMK_OCF_ROOT@" """Root directory for OCF resource agents and libraries.""" + PACEMAKER_CONFIG_DIR = "@PACEMAKER_CONFIG_DIR@" + """Where configuration files such as authkey are kept.""" + RSC_TMP_DIR = "@PCMK__OCF_TMP_DIR@" """Where resource agents should keep state files.""" diff --git a/rpm/pacemaker.spec.in b/rpm/pacemaker.spec.in index 2771f474392..1968b9ed0d5 100644 --- a/rpm/pacemaker.spec.in +++ b/rpm/pacemaker.spec.in @@ -298,7 +298,6 @@ BuildRequires: %{pkgname_glue_libs}-devel %if %{with doc} BuildRequires: asciidoc -BuildRequires: inkscape BuildRequires: %{python_name}-sphinx %endif diff --git a/tools/stonith_admin.c b/tools/stonith_admin.c index 39c4f18ef9a..8129708e293 100644 --- a/tools/stonith_admin.c +++ b/tools/stonith_admin.c @@ -528,6 +528,12 @@ main(int argc, char **argv) target = options.unregister_level; } + if ((options.timeout > (UINT_MAX / 1000)) || (options.timeout < 0)) { + out->err(out, "Integer value \"%d\" for -t out of range", options.timeout); + exit_code = CRM_EX_USAGE; + goto done; + } + if (action == 0) { char *help = g_option_context_get_help(context, TRUE, NULL);