관리-도구

편집 파일: plugin_scheduler.py

# code for cores isolation was inspired by Tuna implementation
# perf code was borrowed from kernel/tools/perf/python/twatch.py
# thanks to Arnaldo Carvalho de Melo <acme@redhat.com>

from . import base
from .decorators import *
import tuned.logs
import re
from subprocess import *
import threading
# perf is optional
try:
	import perf
except ImportError:
# if perf is unavailable, it will be disabled later
	pass
import select
import tuned.consts as consts
import procfs
from tuned.utils.commands import commands
import errno
import os
import collections
import math
# Check existence of scheduler API in os module
try:
	os.SCHED_FIFO
except AttributeError:
	import schedutils

# Module-level logger obtained from tuned.logs, used by the code below.
log = tuned.logs.get()

class SchedulerParams(object):
	"""
	Scheduling parameters recorded for a single task.

	The affinity is kept internally as a bitmask (`_affinity`) and exposed
	through the `affinity` property as a CPU list. Both conversions are
	delegated to the `cmd` helper object passed to the constructor.
	"""

	def __init__(self, cmd, cmdline = None, scheduler = None,
			priority = None, affinity = None, cgroup = None):
		# cmd has to be stored first, the affinity setter below uses it
		self._cmd = cmd
		self.cmdline = cmdline
		self.scheduler = scheduler
		self.priority = priority
		self.affinity = affinity
		self.cgroup = cgroup

	@property
	def affinity(self):
		"""Return the stored bitmask converted to a CPU list, or None if unset."""
		if self._affinity is None:
			return None
		return self._cmd.bitmask2cpulist(self._affinity)

	@affinity.setter
	def affinity(self, value):
		"""Store the CPU list `value` as a bitmask; None clears the affinity."""
		if value is None:
			self._affinity = None
		else:
			self._affinity = self._cmd.cpulist2bitmask(value)

class IRQAffinities(object):
	"""
	Plain record with three fields: the `irqs` dictionary, the `default`
	value and the `unchangeable` list.
	"""

	def __init__(self):
		self.irqs = dict()
		self.default = None
		# IRQs that don't support changing CPU affinity:
		self.unchangeable = list()

class SchedulerUtils(object):
	"""
	Class encapsulating scheduler implementation in os module
	"""

	# scheduler letter used in the configuration -> name of the SCHED_* constant
	_dict_schedcfg2schedconst = {
		"f": "SCHED_FIFO",
		"b": "SCHED_BATCH",
		"r": "SCHED_RR",
		"o": "SCHED_OTHER",
		"i": "SCHED_IDLE",
	}

	def __init__(self):
		"""
		Build the lookup tables from the SCHED_* constants of the os module.

		Raises AttributeError if the os module lacks any of the constants.
		"""
		# {"f": os.SCHED_FIFO...}
		self._dict_schedcfg2num = {
			key: getattr(os, name)
			for key, name in self._dict_schedcfg2schedconst.items()
		}
		# { os.SCHED_FIFO: "SCHED_FIFO"... }
		self._dict_num2schedconst = {
			getattr(os, name): name
			for name in self._dict_schedcfg2schedconst.values()
		}

	def sched_cfg_to_num(self, str_scheduler):
		"""Return the scheduler number for a configuration letter, or None if unknown."""
		return self._dict_schedcfg2num.get(str_scheduler)

	# Reimplementation of schedstr from schedutils for logging purposes
	def sched_num_to_const(self, scheduler):
		"""Return the SCHED_* constant name for a scheduler number, or None if unknown."""
		return self._dict_num2schedconst.get(scheduler)

	def get_scheduler(self, pid):
		"""Return the scheduling policy of `pid` (os.sched_getscheduler)."""
		return os.sched_getscheduler(pid)

	def set_scheduler(self, pid, sched, prio):
		"""Set scheduling policy `sched` and priority `prio` for `pid`."""
		os.sched_setscheduler(pid, sched, os.sched_param(prio))

	def get_affinity(self, pid):
		"""Return the CPU affinity of `pid` (os.sched_getaffinity)."""
		return os.sched_getaffinity(pid)

	def set_affinity(self, pid, affinity):
		"""Set the CPU affinity of `pid` (os.sched_setaffinity)."""
		os.sched_setaffinity(pid, affinity)

	def get_priority(self, pid):
		"""Return the scheduling priority of `pid`."""
		return os.sched_getparam(pid).sched_priority

	def get_priority_min(self, sched):
		"""Return the minimum priority allowed for the policy `sched`."""
		return os.sched_get_priority_min(sched)

	def get_priority_max(self, sched):
		"""Return the maximum priority allowed for the policy `sched`."""
		return os.sched_get_priority_max(sched)

class SchedulerUtilsSchedutils(SchedulerUtils):
	"""
	Class encapsulating scheduler implementation in schedutils module
	"""

	def __init__(self):
		"""
		Build the lookup tables from the SCHED_* constants of schedutils.

		The parent constructor is intentionally not called, it reads the
		constants from the os module instead.
		"""
		# { "f": schedutils.SCHED_FIFO... }
		self._dict_schedcfg2num = {
			key: getattr(schedutils, name)
			for key, name in self._dict_schedcfg2schedconst.items()
		}
		# { schedutils.SCHED_FIFO: "SCHED_FIFO"... }
		self._dict_num2schedconst = {
			getattr(schedutils, name): name
			for name in self._dict_schedcfg2schedconst.values()
		}

	def get_scheduler(self, pid):
		"""Return the scheduling policy of `pid`."""
		return schedutils.get_scheduler(pid)

	def set_scheduler(self, pid, sched, prio):
		"""Set scheduling policy `sched` and priority `prio` for `pid`."""
		schedutils.set_scheduler(pid, sched, prio)

	def get_affinity(self, pid):
		"""Return the CPU affinity of `pid`."""
		return schedutils.get_affinity(pid)

	def set_affinity(self, pid, affinity):
		"""Set the CPU affinity of `pid`."""
		schedutils.set_affinity(pid, affinity)

	def get_priority(self, pid):
		"""Return the scheduling priority of `pid`."""
		return schedutils.get_priority(pid)

	def get_priority_min(self, sched):
		"""Return the minimum priority allowed for the policy `sched`."""
		return schedutils.get_priority_min(sched)

	def get_priority_max(self, sched):
		"""Return the maximum priority allowed for the policy `sched`."""
		return schedutils.get_priority_max(sched)

class SchedulerPlugin(base.Plugin):
	r"""
	Allows tuning of scheduling priorities, process/thread/IRQ
	affinities, and CPU isolation.

To prevent processes/threads/IRQs from using certain CPUs, use
	the [option]`isolated_cores` option. It changes process/thread
	affinities, IRQs affinities and it sets `default_smp_affinity`
	for IRQs. The CPU affinity mask is adjusted for all processes and
	threads matching [option]`ps_whitelist` option subject to success
	of the `sched_setaffinity()` system call. The default setting of
	the [option]`ps_whitelist` regular expression is `.*` to match all
	processes and thread names. To exclude certain processes and threads
	use [option]`ps_blacklist` option. The value of this option is also
	interpreted as a regular expression and process/thread names (`ps -eo
	cmd`) are matched against that expression. Profile rollback allows
	all matching processes and threads to run on all CPUs and restores
	the IRQ settings prior to the profile application.

Multiple regular expressions for [option]`ps_whitelist`
	and [option]`ps_blacklist` options are allowed and separated by
	`;`. Quoted semicolon `\;` is taken literally.

.Isolate CPUs 2-4
	====
	----
	[scheduler]
	isolated_cores=2-4
	ps_blacklist=.*pmd.*;.*PMD.*;^DPDK;.*qemu-kvm.*
	----
	Isolate CPUs 2-4 while ignoring processes and threads matching
	`ps_blacklist` regular expressions.
	====

The [option]`irq_process` option controls whether the scheduler plugin
	applies the `isolated_cores` parameter to IRQ affinities. The default
	value is `true`, which means that the scheduler plugin will move all
	possible IRQs away from the isolated cores. When `irq_process` is set
	to `false`, the plugin will not change any IRQ affinities.

The [option]`default_irq_smp_affinity` option controls the values
	*TuneD* writes to `/proc/irq/default_smp_affinity`. The file specifies
	default affinity mask that applies to all non-active IRQs. Once an
	IRQ is allocated/activated its affinity bitmask will be set to the
	default mask.

The following values are supported:

* `calc`
	+
	The content of `/proc/irq/default_smp_affinity` will be calculated
	from the `isolated_cores` parameter. Non-isolated cores
	are calculated as an inversion of the `isolated_cores`. Then
	the intersection of the non-isolated cores and the previous
	content of `/proc/irq/default_smp_affinity` is written to
	`/proc/irq/default_smp_affinity`. If the intersection is
	an empty set, then just the non-isolated cores are written to
	`/proc/irq/default_smp_affinity`. This behavior is the default if
	the parameter `default_irq_smp_affinity` is omitted.

* `ignore`
	+
	*TuneD* will not touch `/proc/irq/default_smp_affinity`.

* an explicit cpulist
	+
	The cpulist (such as `1,3-4`) is unpacked and written directly to
	`/proc/irq/default_smp_affinity`.

.An explicit CPU list to set the default IRQ smp affinity to CPUs 0 and 2
	====
	----
	[scheduler]
	isolated_cores=1,3
	default_irq_smp_affinity=0,2
	----
	====

To adjust scheduling policy, priority and affinity for a group of
	processes/threads, use the following syntax.

[subs="quotes"]
	----
	group.__groupname__=__rule_prio__:__sched__:__prio__:__affinity__:__regex__
	----

Here, `__rule_prio__` defines internal *TuneD* priority of the
	rule. Rules are sorted based on priority. This is needed for
	inheritance to be able to reorder previously defined rules. Equal
	`__rule_prio__` rules should be processed in the order they were
	defined. However, this is Python interpreter dependent. To disable
	an inherited rule for `__groupname__` use:

[subs="quotes"]
	----
	group.__groupname__=
	----

`__sched__` must be one of:
	*`f`* for FIFO,
	*`b`* for batch,
	*`r`* for round robin,
	*`o`* for other,
	*`i`* for idle,
	*`*`* do not change.

`__affinity__` is CPU affinity in hexadecimal. Use `*` for no change.

`__prio__` scheduling priority (see `chrt -m`).

`__regex__` is Python regular expression. It is matched against the output of:

[subs="quotes"]
	----
	ps -eo cmd
	----

Any given process name may match more than one group. In such a case,
	the priority and scheduling policy are taken from the last matching
	`__regex__`.

.Setting scheduling policy and priorities to kernel threads and watchdog
	====
	----
	[scheduler]
	group.kthreads=0:*:1:*:\[.*\]$
	group.watchdog=0:f:99:*:\[watchdog.*\]
	----
	====

The scheduler plug-in uses perf event loop to catch newly created
	processes. By default it listens to `perf.RECORD_COMM` and
	`perf.RECORD_EXIT` events. By setting [option]`perf_process_fork`
	option to `true`, `perf.RECORD_FORK` events will be also listened
	to. In other words, child processes created by the `fork()` system
	call will be processed. Since child processes inherit CPU affinity
	from their parents, the scheduler plug-in usually does not need to
	explicitly process these events. As processing perf events can
	pose a significant CPU overhead, the [option]`perf_process_fork`
	option parameter is set to `false` by default. Due to this, child
	processes are not processed by the scheduler plug-in.

The CPU overhead of the scheduler plugin can be mitigated by using
	the scheduler [option]`runtime` option and setting it to `0`. This
	will completely disable the dynamic scheduler functionality and the
	perf events will not be monitored and acted upon. The disadvantage
	of this approach is that the process/thread tuning will be done only at
	profile application.

.Disabling the scheduler dynamic functionality
	====
	----
	[scheduler]
	runtime=0
	isolated_cores=1,3
	----
	====

NOTE: For perf events, memory mapped buffer is used. Under heavy load
	the buffer may overflow. In such cases the `scheduler` plug-in
	may start missing events and failing to process some newly created
	processes. Increasing the buffer size may help. The buffer size can
	be set with the [option]`perf_mmap_pages` option. The value of this
	parameter has to be expressed as a power of 2. If it is not a power
	of 2, the nearest higher power of 2 value is calculated from it
	and this calculated value is used. If the [option]`perf_mmap_pages`
	option is omitted, the default kernel value is used.

The scheduler plug-in supports process/thread confinement using
	cgroups v1.

[option]`cgroup_mount_point` option specifies the path to mount the
	cgroup filesystem or where *TuneD* expects it to be mounted. If unset,
	`/sys/fs/cgroup/cpuset` is expected.

If [option]`cgroup_groups_init` option is set to `1` *TuneD*
	will create (and remove) all cgroups defined with the `cgroup*`
	options. This is the default behavior. If it is set to `0` the
	cgroups need to be preset by other means.

If [option]`cgroup_mount_point_init` option is set to `1`,
	*TuneD* will create (and remove) the cgroup mountpoint. It implies
	`cgroup_groups_init = 1`. If set to `0` the cgroups mount point
	needs to be preset by other means. This is the default behavior.

The [option]`cgroup_for_isolated_cores` option is the cgroup
	name used for the [option]`isolated_cores` option functionality. For
	example, if a system has 4 CPUs, `isolated_cores=1` means that all
	processes/threads will be moved to CPUs 0,2-3.
	The scheduler plug-in will isolate the specified core by writing
	the calculated CPU affinity to the `cpuset.cpus` control file of
	the specified cgroup and move all the matching processes/threads to
	this group. If this option is unset, classic cpuset affinity using
	`sched_setaffinity()` will be used.

[option]`cgroup.__cgroup_name__` option defines affinities for
	arbitrary cgroups. Even hierarchic cgroups can be used, but the
	hierarchy needs to be specified in the correct order. Also *TuneD*
	does not do any sanity checks here, with the exception that it forces
	the cgroup to be under [option]`cgroup_mount_point`.

The syntax of the scheduler option starting with `group.` has been
	augmented to use `cgroup.__cgroup_name__` instead of the hexadecimal
	`__affinity__`. The matching processes will be moved to the cgroup
	`__cgroup_name__`. It is also possible to use cgroups which have
	not been defined by the [option]`cgroup.` option as described above,
	i.e. cgroups not managed by *TuneD*.

All cgroup names are sanitized by replacing all dots (`.`) with
	slashes (`/`). This is to prevent the plug-in from writing outside
	[option]`cgroup_mount_point`.

.Using cgroups v1 with the scheduler plug-in
	====
	----
	[scheduler]
	cgroup_mount_point=/sys/fs/cgroup/cpuset
	cgroup_mount_point_init=1
	cgroup_groups_init=1
	cgroup_for_isolated_cores=group
	cgroup.group1=2
	cgroup.group2=0,2
	
	group.ksoftirqd=0:f:2:cgroup.group1:ksoftirqd.*
	ps_blacklist=ksoftirqd.*;rcuc.*;rcub.*;ktimersoftd.*
	isolated_cores=1
	----
	Cgroup `group1` has the affinity set to CPU 2 and the cgroup `group2`
	to CPUs 0,2. Given a 4 CPU setup, the [option]`isolated_cores=1`
	option causes all processes/threads to be moved to CPU
	cores 0,2-3. Processes/threads that are blacklisted by the
	[option]`ps_blacklist` regular expression will not be moved.
	
	The scheduler plug-in will isolate the specified core by writing the
	CPU affinity 0,2-3 to the `cpuset.cpus` control file of the `group`
	and move all the matching processes/threads to this cgroup.
	====

Option [option]`cgroup_ps_blacklist` allows excluding processes
	which belong to the blacklisted cgroups. The regular expression specified
	by this option is matched against cgroup hierarchies from
	`/proc/PID/cgroup`. Cgroups v1 hierarchies from `/proc/PID/cgroup`
	are separated by commas ',' prior to regular expression matching. The
	following is an example of content against which the regular expression
	is matched: `10:hugetlb:/,9:perf_event:/,8:blkio:/`

Multiple regular expressions can be separated by semicolon ';'. The
	semicolon represents a logical 'or' operator.

.Cgroup-based exclusion of processes from the scheduler
	====
	----
	[scheduler]
	isolated_cores=1
	cgroup_ps_blacklist=:/daemons\b
	----
	The scheduler plug-in will move all processes away from core 1 except processes which
	belong to cgroup '/daemons'. The '\b' is a regular expression
	metacharacter that matches a word boundary.
	----
	[scheduler]
	isolated_cores=1
	cgroup_ps_blacklist=\b8:blkio:
	----
	The scheduler plug-in will exclude all processes which belong to a cgroup
	with hierarchy-ID 8 and controller-list blkio.
	====

Kernels 5.13 and newer moved some `sched_` and `numa_balancing_` kernel run-time
	parameters from `/proc/sys/kernel`, managed by the `sysctl` utility, to
	`debugfs`, typically mounted under `/sys/kernel/debug`.  TuneD provides an
	abstraction mechanism for the following parameters via the scheduler plug-in:
	[option]`sched_min_granularity_ns`, [option]`sched_latency_ns`,
	[option]`sched_wakeup_granularity_ns`, [option]`sched_tunable_scaling`,
	[option]`sched_migration_cost_ns`, [option]`sched_nr_migrate`,
	[option]`numa_balancing_scan_delay_ms`,
	[option]`numa_balancing_scan_period_min_ms`,
	[option]`numa_balancing_scan_period_max_ms` and
	[option]`numa_balancing_scan_size_mb`.
	Moreover in kernel 6.6 and newer support for the `sched_wakeup_granularity_ns` and
	`sched_latency_ns` were removed. The `sched_min_granularity_ns` was renamed to
	`sched_base_slice_ns`. Based on the kernel used, TuneD will write the specified
	value to the correct location or ignore it. For the compatibility the alias
	[option]`sched_base_slice_ns` was added, but the [option]`sched_min_granularity_ns`
	can be still used instead.

.Set tasks' "cache hot" value for migration decisions.
	====
	----
	[scheduler]
	sched_migration_cost_ns=500000
	----
	On the old kernels, this is equivalent to:
	----
	[sysctl]
	kernel.sched_migration_cost_ns=500000
	----
	that is, value `500000` will be written to `/proc/sys/kernel/sched_migration_cost_ns`.
	However, on more recent kernels, the value `500000` will be written to
	`/sys/kernel/debug/sched/migration_cost_ns`.
	====
	"""

# Maps scheduler knob names to other knob names. Per the class docstring,
# `sched_min_granularity_ns` was renamed to `sched_base_slice_ns` in kernel 6.6
# and the other two knobs were removed there.
# NOTE(review): the empty string presumably marks a removed knob; the code
# that consumes this map is not visible here -- confirm.
_dict_sched_knob_map = {
		"wakeup_granularity_ns": "",
		"min_granularity_ns": "base_slice_ns",
		"latency_ns": "",
	}

def __init__(self, monitor_repository, storage_factory, hardware_inventory, device_matcher, device_matcher_udev, plugin_instance_factory, global_cfg, variables):
		"""
		Initialize the plugin state shared by all instances.

		All arguments are passed unchanged to the base class constructor.
		If `global_cfg` is not None, the daemon mode and the sleep interval
		are read from it, otherwise the defaults from `consts` are used.
		"""
		super(SchedulerPlugin, self).__init__(monitor_repository, storage_factory, hardware_inventory, device_matcher, device_matcher_udev, plugin_instance_factory, global_cfg, variables)
		self._has_dynamic_options = True
		self._daemon = consts.CFG_DEF_DAEMON
		self._sleep_interval = int(consts.CFG_DEF_SLEEP_INTERVAL)
		if global_cfg is not None:
			self._daemon = global_cfg.get_bool(consts.CFG_DAEMON, consts.CFG_DEF_DAEMON)
			self._sleep_interval = int(global_cfg.get(consts.CFG_SLEEP_INTERVAL, consts.CFG_DEF_SLEEP_INTERVAL))
		self._cmd = commands()
		# helper variable utilized for showing hint only once that the error may be caused by Secure Boot
		self._secure_boot_hint = None
		# paths cache for sched_ and numa_ tunings
		self._sched_knob_paths_cache = {}
		# default is to whitelist all and blacklist none
		self._ps_whitelist = ".*"
		self._ps_blacklist = ""
		self._kthread_process = True
		self._cgroup_ps_blacklist_re = ""
		# perf is optional, if unavailable, it will be disabled later
		try:
			self._cpus = perf.cpu_map()
		except (NameError, AttributeError):
			cpus = self._cmd.read_file(consts.SYSFS_CPUS_PRESENT_PATH)
			# it's different type than perf.cpu_map(), but without perf we use it as iterable
			# which should be compatible, fallback to single core CPU if sysfs is unavailable
			self._cpus = self._cmd.cpulist_unpack(cpus) if cpus else [ 0 ]
		self._scheduler_storage_key = self._storage_key(
				command_name = "scheduler")
		self._irq_process = True
		self._irq_storage_key = self._storage_key(
				command_name = "irq")
		self._evlist = None
		# SchedulerUtils() raises AttributeError if the os module lacks the
		# scheduler API, fall back to the schedutils based implementation
		try:
			self._scheduler_utils = SchedulerUtils()
		except AttributeError:
			self._scheduler_utils = SchedulerUtilsSchedutils()

def _calc_mmap_pages(self, mmap_pages):
		if mmap_pages is None:
			return None
		try:
			mp = int(mmap_pages)
		except ValueError:
			return 0
		if mp <= 0:
			return 0
		# round up to the nearest power of two value
		return int(2 ** math.ceil(math.log(mp, 2)))

def _instance_init(self, instance):
		"""
		Prepare `instance` and the plugin state for tuning.

		Restores scheduling settings left in the storage by a previous run,
		collects the `cgroup.*` options, expands variables in all options
		and, when running as a daemon with runtime tuning enabled, opens
		the perf event list. If perf cannot be set up, runtime tuning is
		disabled for the instance.
		"""
		instance._evlist = None
		instance._has_dynamic_tuning = False
		instance._has_static_tuning = True
		# this is hack, runtime_tuning should be covered by dynamic_tuning configuration
		# TODO: add per plugin dynamic tuning configuration and use dynamic_tuning configuration
		# instead of runtime_tuning
		instance._runtime_tuning = True
		# FIXME: do we want to do this here?
		# recover original values in case of crash
		self._scheduler_original = self._storage.get(
				self._scheduler_storage_key, {})
		if len(self._scheduler_original) > 0:
			log.info("recovering scheduling settings from previous run")
			self._restore_ps_affinity()
			self._scheduler_original = {}
			self._storage.unset(self._scheduler_storage_key)
		self._cgroups_original_affinity = dict()
		# calculated by isolated_cores setter
		self._affinity = None
		self._cgroup_affinity_initialized = False
		self._cgroup = None
		# options named "cgroup.NAME" define the affinity of the cgroup NAME
		self._cgroups = collections.OrderedDict([(self._sanitize_cgroup_path(option[7:]), self._variables.expand(affinity))
			for option, affinity in instance.options.items() if option[:7] == "cgroup." and len(option) > 7])
		instance._scheduler = instance.options
		perf_mmap_pages_raw = self._variables.expand(instance.options["perf_mmap_pages"])
		perf_mmap_pages = self._calc_mmap_pages(perf_mmap_pages_raw)
		if perf_mmap_pages == 0:
			log.error("Invalid 'perf_mmap_pages' value specified: '%s', using default kernel value" % perf_mmap_pages_raw)
			perf_mmap_pages = None
		if perf_mmap_pages is not None and str(perf_mmap_pages) != perf_mmap_pages_raw:
			log.info("'perf_mmap_pages' value has to be power of two, specified: '%s', using: '%d'" %
				(perf_mmap_pages_raw, perf_mmap_pages))
		for k in instance._scheduler:
			instance._scheduler[k] = self._variables.expand(instance._scheduler[k])
		if self._cmd.get_bool(instance._scheduler.get("runtime", 1)) == "0":
			instance._runtime_tuning = False
		instance._terminate = threading.Event()
		if self._daemon and instance._runtime_tuning:
			try:
				instance._threads = perf.thread_map()
				evsel = perf.evsel(type = perf.TYPE_SOFTWARE,
					config = perf.COUNT_SW_DUMMY,
					task = 1, comm = 1, mmap = 0, freq = 0,
					wakeup_events = 1, watermark = 1,
					sample_type = perf.SAMPLE_TID | perf.SAMPLE_CPU)
				evsel.open(cpus = self._cpus, threads = instance._threads)
				instance._evlist = perf.evlist(self._cpus, instance._threads)
				instance._evlist.add(evsel)
				if perf_mmap_pages is None:
					instance._evlist.mmap()
				else:
					instance._evlist.mmap(pages = perf_mmap_pages)
			# no perf; Exception (not a bare except) is caught so that
			# KeyboardInterrupt and SystemExit are not swallowed here
			except Exception:
				log.warning("python-perf unavailable, disabling perf support and " \
					"runtime tuning, you can try to (re)install python(3)-perf package")
				instance._runtime_tuning = False

def _instance_cleanup(self, instance):
		if instance._evlist:
			for fd in instance._evlist.get_pollfd():
				os.close(fd.name)

@classmethod
	def _get_config_options(cls):
		return {
			"isolated_cores": None,
			"cgroup_mount_point": consts.DEF_CGROUP_MOUNT_POINT,
			"cgroup_mount_point_init": False,
			"cgroup_groups_init": True,
			"cgroup_for_isolated_cores": None,
			"cgroup_ps_blacklist": None,
			"ps_whitelist": None,
			"ps_blacklist": None,
			"kthread_process": True,
			"irq_process": True,
			"default_irq_smp_affinity": "calc",
			"perf_mmap_pages": None,
			"perf_process_fork": "false",
			"sched_min_granularity_ns": None,
			"sched_base_slice_ns": None,
			"sched_latency_ns": None,
			"sched_wakeup_granularity_ns": None,
			"sched_tunable_scaling": None,
			"sched_migration_cost_ns": None,
			"sched_nr_migrate": None,
			"numa_balancing_scan_delay_ms": None,
			"numa_balancing_scan_period_min_ms": None,
			"numa_balancing_scan_period_max_ms": None,
			"numa_balancing_scan_size_mb": None
		}

def _sanitize_cgroup_path(self, value):
		return str(value).replace(".", "/") if value is not None else None

# Raises OSError, IOError
	def _get_cmdline(self, process):
		"""
		Return the command line of a task.

		`process` is either a procfs.process object or a value from which
		one is constructed (a PID). The command line of a kernel thread
		is enclosed in square brackets.
		"""
		if isinstance(process, procfs.process):
			proc = process
		else:
			proc = procfs.process(process)
		cmdline = procfs.process_cmdline(proc)
		if not self._is_kthread(proc):
			return cmdline
		return "[" + cmdline + "]"

# Raises OSError, IOError
	def get_processes(self):
		"""
		Return a dictionary mapping task IDs to command lines.

		Kernel threads are skipped if `self._kthread_process` is false.
		Each thread of a process is stored under its own ID with the command
		line of the process. Tasks that vanish meanwhile (ENOENT, ESRCH)
		are skipped, other OSError/IOError exceptions are re-raised.
		"""
		stats = procfs.pidstats()
		stats.reload_threads()
		vanished = (errno.ENOENT, errno.ESRCH)
		found = {}
		for proc in stats.values():
			try:
				if not self._kthread_process and self._is_kthread(proc):
					continue
				cmdline = self._get_cmdline(proc)
				found[proc["pid"]] = cmdline
				if "threads" in proc:
					for tid in proc["threads"].keys():
						found[tid] = self._get_cmdline(proc)
			except (OSError, IOError) as err:
				if err.errno not in vanished:
					raise
		return found

# Raises OSError
	# Raises SystemError with old (pre-0.4) python-schedutils
	# instead of OSError
	# If PID doesn't exist, errno == ESRCH
	def _get_rt(self, pid):
		"""Read, log and return the (scheduler, priority) pair of the task `pid`."""
		utils = self._scheduler_utils
		scheduler = utils.get_scheduler(pid)
		# the name of the policy is needed only for the log message
		sched_name = utils.sched_num_to_const(scheduler)
		priority = utils.get_priority(pid)
		log.debug("Read scheduler policy '%s' and priority '%d' of PID '%d'"
				% (sched_name, priority, pid))
		return (scheduler, priority)

def _set_rt(self, pid, sched, prio):
		"""
		Set the scheduling policy `sched` and priority `prio` of the task `pid`.

		A priority outside of the range allowed for the policy is reported
		as an error, but the change is attempted anyway. Failures are only
		logged, nothing is raised for SystemError/OSError.
		"""
		utils = self._scheduler_utils
		sched_name = utils.sched_num_to_const(sched)
		log.debug("Setting scheduler policy to '%s' and priority to '%d' of PID '%d'."
				% (sched_name, prio, pid))
		try:
			lowest = utils.get_priority_min(sched)
			highest = utils.get_priority_max(sched)
			if not lowest <= prio <= highest:
				log.error("Priority for %s must be in range %d - %d. '%d' was given."
						% (sched_name, lowest, highest, prio))
		# Workaround for old (pre-0.4) python-schedutils which raised
		# SystemError instead of OSError
		except (SystemError, OSError) as err:
			log.error("Failed to get allowed priority range: %s"
					% err)
		try:
			utils.set_scheduler(pid, sched, prio)
		except (SystemError, OSError) as err:
			# SystemError has no errno attribute
			if getattr(err, "errno", None) == errno.ESRCH:
				log.debug("Failed to set scheduling parameters of PID %d, the task vanished."
						% pid)
			else:
				log.error("Failed to set scheduling parameters of PID %d: %s"
						% (pid, err))

# process is a procfs.process object
	# Raises OSError, IOError
	def _is_kthread(self, process):
		"""Return True if the PF_KTHREAD bit is set in the stat flags of `process`."""
		flags = process["stat"]["flags"]
		return (flags & procfs.pidstat.PF_KTHREAD) != 0

def _process_in_blacklisted_cgroup(self, process):
		if self._cgroup_ps_blacklist_re == "":
			return False
		return re.search(self._cgroup_ps_blacklist_re, self._get_stat_cgroup(process)) is not None

# Returns True if we can ignore a failed affinity change of
	# a process with the given PID and therefore not report it as an error.
	def _ignore_set_affinity_error(self, process):
		"""
		Decide whether a failed affinity change of `process` can be ignored.

		Returns True for zombies, tasks in a blacklisted cgroup, tasks bound
		to a CPU and tasks that vanished. Otherwise details about the task
		are logged and False is returned, so the caller can report an error.
		"""
		pid = process.pid
		try:
			if process["stat"]["state"] == "Z":
				log.debug("Affinity of zombie task with PID %d could not be changed."
						% pid)
				return True
			if self._process_in_blacklisted_cgroup(process):
				log.debug("Affinity of task with PID %d could not be changed, the task was moved into a blacklisted cgroup."
						% pid)
				return True
			if process["stat"].is_bound_to_cpu():
				# a bound kernel thread is only reported at the debug level
				if self._is_kthread(process):
					report = log.debug
					message = "Affinity of kernel thread with PID %d cannot be changed, the task's affinity mask is fixed."
				else:
					report = log.warning
					message = "Affinity of task with PID %d cannot be changed, the task's affinity mask is fixed."
				report(message % pid)
				return True
			log.info("Task %d cmdline: %s" % (pid, self._get_cmdline(process)))
			log.info("Task %d cgroup: %s" % (pid, self._get_stat_cgroup(process)))
			log.info("Task %d affinity: %s" % (pid, list(self._scheduler_utils.get_affinity(pid))))
		except (OSError, IOError) as err:
			if err.errno in (errno.ENOENT, errno.ESRCH):
				log.debug("Failed to get task info for PID %d, the task vanished."
						% pid)
				return True
			log.error("Failed to get task info for PID %d: %s"
					% (pid, err))
		except (AttributeError, KeyError) as err:
			log.error("Failed to get task info for PID %d: %s"
					% (pid, err))
		return False

def _store_orig_process_rt(self, pid, scheduler, priority):
		try:
			params = self._scheduler_original[pid]
		except KeyError:
			params = SchedulerParams(self._cmd)
			self._scheduler_original[pid] = params
		if params.scheduler is None and params.priority is None:
			params.scheduler = scheduler
			params.priority = priority

def _tune_process_rt(self, pid, sched, prio):
		cont = True
		if sched is None and prio is None:
			return cont
		try:
			(prev_sched, prev_prio) = self._get_rt(pid)
			if sched is None:
				sched = prev_sched
			self._set_rt(pid, sched, prio)
			self._store_orig_process_rt(pid, prev_sched, prev_prio)
		except (SystemError, OSError) as e:
			if hasattr(e, "errno") and e.errno == errno.ESRCH:
				log.debug("Failed to read scheduler policy of PID %d, the task vanished."
						% pid)
				if pid in self._scheduler_original:
					del self._scheduler_original[pid]
				cont = False
			else:
				log.error("Refusing to set scheduler and priority of PID %d, reading original scheduling parameters failed: %s"
						% (pid, e))
		return cont

def _is_cgroup_affinity(self, affinity):
		return str(affinity)[:7] == "cgroup."

def _store_orig_process_affinity(self, pid, affinity, is_cgroup = False):
		try:
			params = self._scheduler_original[pid]
		except KeyError:
			params = SchedulerParams(self._cmd)
			self._scheduler_original[pid] = params
		if params.affinity is None and params.cgroup is None:
			if is_cgroup:
				params.cgroup = affinity
			else:
				params.affinity = affinity

def _get_cgroup_affinity(self, pid):
		"""
		Return the cpuset cgroup of the task `pid` without the leading slash.

		"/" is returned for the root cgroup and also when no cpuset line
		is found in the cgroup file of the task.
		"""
		# we cannot use procfs, because it uses comma ',' delimiter which
		# can be ambiguous
		cgroup_file = "%s/%s/%s" % (consts.PROCFS_MOUNT_POINT, str(pid), "cgroup")
		content = self._cmd.read_file(cgroup_file, no_error = True)
		for line in content.split("\n"):
			fields = line.split(":cpuset:")
			if len(fields) < 2:
				# not the cpuset hierarchy
				continue
			cgroup = fields[1][1:]
			return cgroup if cgroup != "" else "/"
		return "/"

# it can be arbitrary cgroup even cgroup we didn't set, but it needs to be
	# under "cgroup_mount_point"
	def _set_cgroup(self, pid, cgroup):
		cgroup = self._sanitize_cgroup_path(cgroup)
		path = self._cgroup_mount_point
		if cgroup != "/":
			path = "%s/%s" % (path, cgroup)
		self._cmd.write_to_file("%s/tasks" % path, str(pid), no_error = True)

def _parse_cgroup_affinity(self, cgroup):
		# "cgroup.CGROUP"
		cgroup = cgroup[7:]
		# this should be faster than string comparison
		is_cgroup = not isinstance(cgroup, list) and len(cgroup) > 0
		return is_cgroup, cgroup

def _tune_process_affinity(self, pid, affinity, intersect = False):
		cont = True
		if affinity is None:
			return cont
		try:
			(is_cgroup, cgroup) = self._parse_cgroup_affinity(affinity)
			if is_cgroup:
				prev_affinity = self._get_cgroup_affinity(pid)
				self._set_cgroup(pid, cgroup)
			else:
				prev_affinity = self._get_affinity(pid)
				if intersect:
					affinity = self._get_intersect_affinity(
							prev_affinity, affinity,
							affinity)
				self._set_affinity(pid, affinity)
			self._store_orig_process_affinity(pid,
					prev_affinity, is_cgroup)
		except (SystemError, OSError) as e:
			if hasattr(e, "errno") and e.errno == errno.ESRCH:
				log.debug("Failed to read affinity of PID %d, the task vanished."
						% pid)
				if pid in self._scheduler_original:
					del self._scheduler_original[pid]
				cont = False
			else:
				log.error("Refusing to set CPU affinity of PID %d, reading original affinity failed: %s"
						% (pid, e))
		return cont

#tune process and store previous values
	def _tune_process(self, pid, cmd, sched, prio, affinity):
		cont = self._tune_process_rt(pid, sched, prio)
		if not cont:
			return
		cont = self._tune_process_affinity(pid, affinity)
		if not cont or pid not in self._scheduler_original:
			return
		self._scheduler_original[pid].cmdline = cmd

def _convert_sched_params(self, str_scheduler, str_priority):
		scheduler = self._scheduler_utils.sched_cfg_to_num(str_scheduler)
		if scheduler is None and str_scheduler != "*":
			log.error("Invalid scheduler: %s. Scheduler and priority will be ignored."
					% str_scheduler)
			return (None, None)
		else:
			try:
				priority = int(str_priority)
			except ValueError:
				log.error("Invalid priority: %s. Scheduler and priority will be ignored."
							% str_priority)
				return (None, None)
		return (scheduler, priority)

def _convert_affinity(self, str_affinity):
		if str_affinity == "*":
			affinity = None
		elif self._is_cgroup_affinity(str_affinity):
			affinity = str_affinity
		else:
			affinity = self._cmd.hex2cpulist(str_affinity)
			if not affinity:
				log.error("Invalid affinity: %s. It will be ignored."
						% str_affinity)
				affinity = None
		return affinity

def _convert_sched_cfg(self, vals):
		(rule_prio, scheduler, priority, affinity, regex) = vals
		(scheduler, priority) = self._convert_sched_params(
				scheduler, priority)
		affinity = self._convert_affinity(affinity)
		return (rule_prio, scheduler, priority, affinity, regex)

def _cgroup_create_group(self, cgroup):
		"""
		Create the directory of `cgroup` under the cgroup mount point and
		copy `cpuset.mems` of the mount point to it.

		Failures are only logged.
		"""
		mount_point = self._cgroup_mount_point
		path = "%s/%s" % (mount_point, cgroup)
		try:
			os.mkdir(path, consts.DEF_CGROUP_MODE)
		except OSError as err:
			log.error("Unable to create cgroup '%s': %s" % (path, err))
		# the new group gets the same 'cpuset.mems' as the mount point has
		parent_mems = self._cmd.read_file("%s/%s" % (mount_point, "cpuset.mems"), no_error = True)
		written = self._cmd.write_to_file("%s/%s" % (path, "cpuset.mems"),
				parent_mems, no_error = True)
		if not written:
			log.error("Unable to initialize 'cpuset.mems ' for cgroup '%s'" % path)

def _cgroup_initialize_groups(self):
		if self._cgroup is not None and not self._cgroup in self._cgroups:
			self._cgroup_create_group(self._cgroup)
		for cg in self._cgroups:
			self._cgroup_create_group(cg)

def _cgroup_initialize(self):
		"""
		Create the cgroup mount point directory and mount the cpuset
		cgroup on it, failures are only logged
		"""
		log.debug("Initializing cgroups settings")
		mount_point = self._cgroup_mount_point
		try:
			os.makedirs(mount_point, consts.DEF_CGROUP_MODE)
		except OSError as e:
			log.error("Unable to create cgroup mount point: %s" % e)
		mount_cmd = ["mount", "-t", "cgroup", "-o", "cpuset", "cpuset", mount_point]
		(ret, out) = self._cmd.execute(mount_cmd)
		if ret != 0:
			log.error("Unable to mount '%s'" % mount_point)

def _remove_dir(self, cgroup):
		try:
			os.rmdir(cgroup)
		except OSError as e:
			log.error("Unable to remove directory '%s': %s" % (cgroup, e))

def _cgroup_finalize_groups(self):
		for cg in reversed(self._cgroups):
			self._remove_dir("%s/%s" % (self._cgroup_mount_point, cg))
		if self._cgroup is not None and not self._cgroup in self._cgroups:
			self._remove_dir("%s/%s" % (self._cgroup_mount_point, self._cgroup))

def _cgroup_finalize(self):
		"""
		Unmount the cgroup mount point and remove its directory

		The parent directory of the mount point is removed as well unless
		it is "/". Returns False if the umount fails, None otherwise.
		"""
		log.debug("Removing cgroups settings")
		mount_point = self._cgroup_mount_point
		(ret, out) = self._cmd.execute(["umount", mount_point])
		if ret != 0:
			log.error("Unable to umount '%s'" % mount_point)
			return False
		self._remove_dir(mount_point)
		parent_dir = os.path.dirname(mount_point)
		if parent_dir != "/":
			self._remove_dir(parent_dir)

def _cgroup_set_affinity_one(self, cgroup, affinity, backup = False):
		"""
		Write affinity to 'cpuset.cpus' of the cgroup

		Nothing is written for an empty affinity. If backup is True, the
		current content of 'cpuset.cpus' is stored in
		self._cgroups_original_affinity first and if it cannot be read,
		the affinity is left untouched.
		"""
		if affinity == "":
			log.debug("Skipping cgroup '%s', empty affinity requested" % cgroup)
			return
		log.debug("Setting cgroup '%s' affinity to '%s'" % (cgroup, affinity))
		cpus_file = "%s/%s/%s" % (self._cgroup_mount_point, cgroup, "cpuset.cpus")
		if backup:
			# "ERR" is the marker returned when the file cannot be read
			saved = self._cmd.read_file(cpus_file, err_ret = "ERR", no_error = True).strip()
			if saved == "ERR":
				log.error("Refusing to set affinity of cgroup '%s', reading original affinity failed" % cgroup)
				return
			self._cgroups_original_affinity[cgroup] = saved
		written = self._cmd.write_to_file(cpus_file, affinity, no_error = True)
		if not written:
			log.error("Unable to set affinity '%s' for cgroup '%s'" % (affinity, cgroup))

def _cgroup_set_affinity(self):
		if self._cgroup_affinity_initialized:
			return
		log.debug("Setting cgroups affinities")
		if self._affinity is not None and self._cgroup is not None and not self._cgroup in self._cgroups:
			self._cgroup_set_affinity_one(self._cgroup, self._affinity, backup = True)
		for cg in self._cgroups.items():
			self._cgroup_set_affinity_one(cg[0], cg[1], backup = True)
		self._cgroup_affinity_initialized = True

def _cgroup_restore_affinity(self):
		"""
		Write the affinities backed up in self._cgroups_original_affinity
		back to the cgroups
		"""
		log.debug("Restoring cgroups affinities")
		for name, saved_affinity in self._cgroups_original_affinity.items():
			self._cgroup_set_affinity_one(name, saved_affinity)

def _instance_apply_static(self, instance):
		"""
		Apply the static tuning of the instance

		The cgroup related options are processed first, then the super class
		implementation is called, the cgroups affinities are set and the
		running processes are tuned according to the "group.*" scheduler
		rules. If runtime tuning is enabled, the thread running
		_thread_code is started at the end.
		"""
		# need to get "cgroup_mount_point_init", "cgroup_mount_point", "cgroup_groups_init",
		# "cgroup", and initialize mount point and cgroups before super class implementation call
		options = instance.options
		self._cgroup_mount_point = self._variables.expand(options["cgroup_mount_point"])
		self._cgroup_mount_point_init = self._cmd.get_bool(self._variables.expand(
			options["cgroup_mount_point_init"])) == "1"
		self._cgroup_groups_init = self._cmd.get_bool(self._variables.expand(
			options["cgroup_groups_init"])) == "1"
		self._cgroup = self._sanitize_cgroup_path(self._variables.expand(
			options["cgroup_for_isolated_cores"]))

		if self._cgroup_mount_point_init:
			self._cgroup_initialize()
		if self._cgroup_groups_init or self._cgroup_mount_point_init:
			self._cgroup_initialize_groups()

		super(SchedulerPlugin, self)._instance_apply_static(instance)

		self._cgroup_set_affinity()
		try:
			ps = self.get_processes()
		except (OSError, IOError) as e:
			log.error("error applying tuning, cannot get information about running processes: %s"
					% e)
			return
		# every rule value has the form
		# "rule_prio:scheduler:priority:affinity:regex"
		split_cfg = [(option, str(value).split(":", 4)) for option, value in instance._scheduler.items()]
		rules = []
		for option, vals in split_cfg:
			if re.match(r"group\.", option) and len(vals) == 5:
				rules.append((option, self._convert_sched_cfg(vals)))
		# order the rules by their rule priority
		rules.sort(key = lambda rule: rule[1][0])
		sched_all = {}
		# for runtime tuning
		instance._sched_lookup = {}
		for option, (rule_prio, scheduler, priority, affinity, regex) in rules:
			try:
				r = re.compile(regex)
			except re.error as e:
				log.error("error compiling regular expression: '%s'" % str(regex))
				continue
			# cmd - process name, option - group name
			matched = {pid: (cmd, option, scheduler, priority, affinity, regex)
					for pid, cmd in ps.items() if r.search(cmd) is not None}
			# a later rule overrides the earlier ones for the same PID
			sched_all.update(matched)
			# make any contained regexes non-capturing: replace "(" with "(?:",
			# unless the "(" is preceded by "\" or followed by "?"
			lookup_regex = re.sub(r"(?<!\\)\((?!\?)", "(?:", str(regex))
			instance._sched_lookup[lookup_regex] = [scheduler, priority, affinity]
		for pid, params in sched_all.items():
			(cmd, option, scheduler, priority, affinity, regex) = params
			self._tune_process(pid, cmd, scheduler,
					priority, affinity)
		self._storage.set(self._scheduler_storage_key,
				self._scheduler_original)
		if self._daemon and instance._runtime_tuning:
			instance._thread = threading.Thread(target = self._thread_code, args = [instance])
			instance._thread.start()

def _restore_ps_affinity(self):
		try:
			ps = self.get_processes()
		except (OSError, IOError) as e:
			log.error("error unapplying tuning, cannot get information about running processes: %s"
					% e)
			return
		for pid, orig_params in self._scheduler_original.items():
			# if command line for the pid didn't change, it's very probably the same process
			if pid not in ps or ps[pid] != orig_params.cmdline:
				continue
			if orig_params.scheduler is not None \
					and orig_params.priority is not None:
				self._set_rt(pid, orig_params.scheduler,
						orig_params.priority)
			if orig_params.cgroup is not None:
				self._set_cgroup(pid, orig_params.cgroup)
			elif orig_params.affinity is not None:
				self._set_affinity(pid, orig_params.affinity)
		self._scheduler_original = {}
		self._storage.unset(self._scheduler_storage_key)

def _cgroup_cleanup_tasks_one(self, cgroup):
		"""
		Move all tasks from the cgroup to the cgroup at the mount point

		The 'tasks' file of the cgroup is read repeatedly (at most
		consts.CGROUP_CLEANUP_TASKS_RETRY times) and every listed task is
		written to the 'tasks' file at the cgroup mount point, until the
		cgroup 'tasks' file is read as empty. A warning is logged if the
		cgroup could not be confirmed empty within the allowed attempts.
		"""
		tasks_file = "%s/%s/%s" % (self._cgroup_mount_point, cgroup, "tasks")
		root_tasks_file = "%s/%s" % (self._cgroup_mount_point, "tasks")
		cnt = int(consts.CGROUP_CLEANUP_TASKS_RETRY)
		# " " marks "not read yet" / "read failed", "" means no tasks left
		data = " "
		while data != "" and cnt > 0:
			data = self._cmd.read_file(tasks_file, err_ret = " ", no_error = True)
			if data not in ["", " "]:
				for l in data.split("\n"):
					# the split leaves blank items (e.g. after the trailing
					# newline), they are not task IDs
					if l.strip() != "":
						self._cmd.write_to_file(root_tasks_file, l, no_error = True)
			cnt -= 1
		# Decide by the result of the last read, not by the exhausted counter,
		# otherwise a cleanup which succeeded on the very last attempt would
		# be reported as failed
		if data != "":
			log.warning("Unable to cleanup tasks from cgroup '%s'" % cgroup)

def _cgroup_cleanup_tasks(self):
		if self._cgroup is not None and not self._cgroup in self._cgroups:
			self._cgroup_cleanup_tasks_one(self._cgroup)
		for cg in self._cgroups:
			self._cgroup_cleanup_tasks_one(cg)

def _instance_unapply_static(self, instance, rollback = consts.ROLLBACK_SOFT):
		"""
		Revert the static tuning of the instance

		The super class implementation is called first, then the runtime
		tuning thread (if it is used) is stopped, the original settings of
		processes and the original cgroups affinities are restored, tasks
		are moved out of the cgroups and finally the cgroups and the cgroup
		mount point are removed if the respective *_init options are set.
		"""
		super(SchedulerPlugin, self)._instance_unapply_static(instance, rollback)
		if self._daemon and instance._runtime_tuning:
			# ask the runtime tuning thread to finish and wait for it
			instance._terminate.set()
			instance._thread.join()
		self._restore_ps_affinity()
		self._cgroup_restore_affinity()
		self._cgroup_cleanup_tasks()
		# same conditions as used for the initialization in _instance_apply_static()
		if self._cgroup_groups_init or self._cgroup_mount_point_init:
			self._cgroup_finalize_groups()
		if self._cgroup_mount_point_init:
			self._cgroup_finalize()

def _cgroup_verify_affinity_one(self, cgroup, affinity):
		"""
		Verify that 'cpuset.cpus' of the cgroup matches the affinity

		Both values are normalized by cpulist_pack and cpulist2string before
		the comparison. Returns True on match and also if 'cpuset.cpus'
		cannot be read, False otherwise. The result is logged.
		"""
		log.debug("Verifying cgroup '%s' affinity" % cgroup)
		cpus_file = "%s/%s/%s" % (self._cgroup_mount_point, cgroup, "cpuset.cpus")
		current = self._cmd.read_file(cpus_file, err_ret = "ERR", no_error = True)
		if current == "ERR":
			return True
		current = self._cmd.cpulist2string(self._cmd.cpulist_pack(current))
		expected = self._cmd.cpulist2string(self._cmd.cpulist_pack(affinity))
		description = "cgroup '%s' affinity" % cgroup
		if current != expected:
			log.error(consts.STR_VERIFY_PROFILE_VALUE_FAIL
					% (description, current,
					expected))
			return False
		log.info(consts.STR_VERIFY_PROFILE_VALUE_OK
				% (description, current))
		return True

def _cgroup_verify_affinity(self):
		"""
		Verify affinities of the cgroup for isolated cores and of the
		cgroups from self._cgroups

		The verification stops at the first cgroup which fails, the
		remaining cgroups are not checked then.
		"""
		log.debug("Veryfying cgroups affinities")
		ok = True
		own_cgroup_used = self._affinity is not None and self._cgroup is not None \
				and self._cgroup not in self._cgroups
		if own_cgroup_used:
			ok = ok and self._cgroup_verify_affinity_one(self._cgroup, self._affinity)
		for name, cg_affinity in self._cgroups.items():
			ok = ok and self._cgroup_verify_affinity_one(name, cg_affinity)
		return ok

def _instance_verify_static(self, instance, ignore_missing, devices):
		"""
		Verify the static tuning of the instance

		Both the super class verification and the verification of cgroups
		affinities are always run, the combined result is returned.
		"""
		base_result = super(SchedulerPlugin, self)._instance_verify_static(instance, ignore_missing, devices)
		cgroups_result = self._cgroup_verify_affinity()
		return base_result and cgroups_result

def _add_pid(self, instance, pid, r):
		"""
		Tune a newly appeared process if it matches any scheduler rule

		Kernel threads are skipped unless self._kthread_process is set.
		A process which matches (re_lookup over instance._sched_lookup) and
		is not yet recorded in self._scheduler_original is tuned and the
		record is stored.
		"""
		try:
			proc = procfs.process(pid)
			if not self._kthread_process and self._is_kthread(proc):
				return
			cmd = self._get_cmdline(pid)
		except (OSError, IOError) as e:
			if e.errno in (errno.ENOENT, errno.ESRCH):
				log.debug("Failed to get cmdline of PID %d, the task vanished."
						% pid)
			else:
				log.error("Failed to get cmdline of PID %d: %s"
						% (pid, e))
			return
		match = self._cmd.re_lookup(instance._sched_lookup, cmd, r)
		if match is None or pid in self._scheduler_original:
			return
		log.debug("tuning new process '%s' with PID '%d' by '%s'" % (cmd, pid, str(match)))
		(sched, prio, affinity) = match
		self._tune_process(pid, cmd, sched, prio,
				affinity)
		self._storage.set(self._scheduler_storage_key,
				self._scheduler_original)

def _remove_pid(self, instance, pid):
		if pid in self._scheduler_original:
			del self._scheduler_original[pid]
			log.debug("removed PID %d from the rollback database" % pid)
			self._storage.set(self._scheduler_storage_key,
					self._scheduler_original)

def _thread_code(self, instance):
		"""
		Code of the runtime tuning thread (it is started from
		_instance_apply_static)

		It polls the perf event list of the instance until
		instance._terminate is set. Comm events (and fork events if
		self._perf_process_fork_value is set) are passed to _add_pid,
		exit events to _remove_pid.
		"""
		r = self._cmd.re_lookup_compile(instance._sched_lookup)
		poll = select.poll()
		# Store the file objects in a local variable so that they don't
		# go out of scope too soon. This is a workaround for
		# python3-perf bug rhbz#1659445.
		fds = instance._evlist.get_pollfd()
		for fd in fds:
			poll.register(fd)
		while not instance._terminate.is_set():
			# timeout to poll in milliseconds
			if len(poll.poll(self._sleep_interval * 1000)) > 0 and not instance._terminate.is_set():
				# keep reading as long as at least one CPU returned an event
				# in the last pass over all CPUs
				read_events = True
				while read_events:
					read_events = False
					for cpu in self._cpus:
						event = instance._evlist.read_on_cpu(cpu)
						if event:
							read_events = True
							# new or renamed task (and optionally forked task)
							if isinstance(event, perf.comm_event) or (
								self._perf_process_fork_value
								and isinstance(event, perf.task_event)
								and event.type == perf.RECORD_FORK
							):
								self._add_pid(instance, int(event.tid), r)
							# exited task
							elif isinstance(event, perf.task_event) and event.type == perf.RECORD_EXIT:
								self._remove_pid(instance, int(event.tid))

@command_custom("cgroup_ps_blacklist", per_device = False)
	def _cgroup_ps_blacklist(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			self._cgroup_ps_blacklist_re = "|".join(["(%s)" % v for v in re.split(r"(?<!\\);", str(value))])

@command_custom("ps_whitelist", per_device = False)
	def _ps_whitelist(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			self._ps_whitelist = "|".join(["(%s)" % v for v in re.split(r"(?<!\\);", str(value))])

@command_custom("ps_blacklist", per_device = False)
	def _ps_blacklist(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			self._ps_blacklist = "|".join(["(%s)" % v for v in re.split(r"(?<!\\);", str(value))])

@command_custom("kthread_process", per_device = False)
	def _kthread_process(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			self._kthread_process = self._cmd.get_bool(value) == "1"

@command_custom("irq_process", per_device = False)
	def _irq_process(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			self._irq_process = self._cmd.get_bool(value) == "1"

@command_custom("default_irq_smp_affinity", per_device = False)
	def _default_irq_smp_affinity(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			if value in ["calc", "ignore"]:
				self._default_irq_smp_affinity_value = value
			else:
				self._default_irq_smp_affinity_value = self._cmd.cpulist_unpack(value)

@command_custom("perf_process_fork", per_device = False)
	def _perf_process_fork(self, enabling, value, verify, ignore_missing, instance):
		# currently unsupported
		if verify:
			return None
		if enabling and value is not None:
			self._perf_process_fork_value = self._cmd.get_bool(value) == "1"

# Raises OSError
	# Raises SystemError with old (pre-0.4) python-schedutils
	# instead of OSError
	# If PID doesn't exist, errno == ESRCH
	def _get_affinity(self, pid):
		res = self._scheduler_utils.get_affinity(pid)
		log.debug("Read affinity '%s' of PID %d" % (res, pid))
		return res

def _set_affinity(self, pid, affinity):
		"""
		Set CPU affinity of the PID

		Nothing is done for a task which belongs to a blacklisted cgroup.
		A failure is logged as an error unless _ignore_set_affinity_error
		tells to ignore it.
		"""
		proc = procfs.process(pid)
		if self._process_in_blacklisted_cgroup(proc):
			log.debug("Not setting CPU affinity of PID %d, the task belongs to a blacklisted cgroup." % pid)
			return
		log.debug("Setting CPU affinity of PID %d to '%s'." % (pid, affinity))
		try:
			self._scheduler_utils.set_affinity(pid, affinity)
		# Workaround for old python-schedutils (pre-0.4) which
		# incorrectly raised SystemError instead of OSError
		except (SystemError, OSError) as e:
			if self._ignore_set_affinity_error(proc):
				return
			log.error("Failed to set affinity of PID %d to '%s': %s"
					% (pid, affinity, e))

# returns intersection of affinity1 with affinity2, if intersection is empty it returns affinity3
	def _get_intersect_affinity(self, affinity1, affinity2, affinity3):
		aff = set(affinity1).intersection(set(affinity2))
		if aff:
			return list(aff)
		return affinity3

def _set_all_obj_affinity(self, objs, affinity, threads = False):
		"""
		Set affinity of all the process (or thread) objects in objs

		Kernel threads are filtered out unless self._kthread_process is set,
		then the objects are filtered by self._ps_whitelist and (if it is
		not empty) by self._ps_blacklist, both matched against the 'comm'
		of the object. The affinity is applied by _tune_process_affinity
		with intersect = True. Unless threads is True, threads of every
		tuned process are processed by a recursive call.
		"""
		candidates = objs
		if not self._kthread_process:
			candidates = [o for o in candidates if not self._is_kthread(o)]
		candidates = [o for o in candidates if re.search(self._ps_whitelist,
				self._get_stat_comm(o)) is not None]
		if self._ps_blacklist != "":
			candidates = [o for o in candidates if re.search(self._ps_blacklist,
					self._get_stat_comm(o)) is None]
		by_pid = {o.pid: o for o in candidates}
		for pid, proc in by_pid.items():
			try:
				cmd = self._get_cmdline(proc)
			except (OSError, IOError) as e:
				if e.errno in (errno.ENOENT, errno.ESRCH):
					log.debug("Failed to get cmdline of PID %d, the task vanished."
							% pid)
				else:
					log.error("Refusing to set affinity of PID %d, failed to get its cmdline: %s"
							% (pid, e))
				continue
			if not self._tune_process_affinity(pid, affinity,
					intersect = True):
				continue
			if pid in self._scheduler_original:
				self._scheduler_original[pid].cmdline = cmd
			# process threads
			if not threads and "threads" in proc:
				self._set_all_obj_affinity(
						proc["threads"].values(),
						affinity, True)

def _get_stat_cgroup(self, o):
		try:
			return o["cgroups"]
		except (OSError, IOError, KeyError):
			return ""

def _get_stat_comm(self, o):
		try:
			return o["stat"]["comm"]
		except (OSError, IOError, KeyError):
			return ""

def _set_ps_affinity(self, affinity):
		"""
		Set the affinity for all processes listed by procfs (including
		their threads), a failure to list the processes is only logged
		"""
		try:
			pidstats = procfs.pidstats()
			pidstats.reload_threads()
			self._set_all_obj_affinity(pidstats.values(), affinity, False)
		except (OSError, IOError) as e:
			log.error("error applying tuning, cannot get information about running processes: %s"
					% e)

# Returns 0 on success, -2 if changing the affinity is not
	# supported, -1 if some other error occurs.
	def _set_irq_affinity(self, irq, affinity, restoring):
		try:
			affinity_hex = self._cmd.cpulist2hex(affinity)
			log.debug("Setting SMP affinity of IRQ %s to '%s'"
					% (irq, affinity_hex))
			filename = "/proc/irq/%s/smp_affinity" % irq
			with open(filename, "w") as f:
				f.write(affinity_hex)
			return 0
		except (OSError, IOError) as e:
			# EIO is returned by
			# kernel/irq/proc.c:write_irq_affinity() if changing
			# the affinity is not supported
			# (at least on kernels 3.10 and 4.18)
			if hasattr(e, "errno") and e.errno == errno.EIO \
					and not restoring:
				log.debug("Setting SMP affinity of IRQ %s is not supported"
						% irq)
				return -2
			else:
				log.error("Failed to set SMP affinity of IRQ %s to '%s': %s"
						% (irq, affinity_hex, e))
				return -1

def _set_default_irq_affinity(self, affinity):
		"""
		Write the affinity (converted to hex mask) to
		/proc/irq/default_smp_affinity, a failure is only logged
		"""
		try:
			hex_mask = self._cmd.cpulist2hex(affinity)
			log.debug("Setting default SMP IRQ affinity to '%s'"
					% hex_mask)
			with open("/proc/irq/default_smp_affinity", "w") as default_file:
				default_file.write(hex_mask)
		except (OSError, IOError) as e:
			log.error("Failed to set default SMP IRQ affinity to '%s': %s"
					% (hex_mask, e))

def _set_all_irq_affinity(self, affinity):
		"""
		Set SMP affinity of all IRQs and the default IRQ SMP affinity

		Every IRQ gets the intersection of its current affinity with the
		requested affinity (or the requested affinity if the intersection
		is empty). The original affinities and the IRQs which do not
		support the change are recorded and stored under
		self._irq_storage_key. The default affinity is handled according to
		self._default_irq_smp_affinity_value ("calc", "ignore" or a CPU list).
		"""
		irq_original = IRQAffinities()
		irqs = procfs.interrupts()
		for irq in irqs.keys():
			try:
				prev_affinity = irqs[irq]["affinity"]
				log.debug("Read affinity of IRQ '%s': '%s'"
						% (irq, prev_affinity))
			except KeyError:
				continue
			new_affinity = self._get_intersect_affinity(prev_affinity, affinity, affinity)
			# nothing to change
			if set(new_affinity) == set(prev_affinity):
				continue
			res = self._set_irq_affinity(irq, new_affinity, False)
			if res == 0:
				irq_original.irqs[irq] = prev_affinity
			elif res == -2:
				irq_original.unchangeable.append(irq)

		# default affinity
		prev_default_hex = self._cmd.read_file("/proc/irq/default_smp_affinity")
		prev_default = self._cmd.hex2cpulist(prev_default_hex)
		mode = self._default_irq_smp_affinity_value
		if mode != "ignore":
			if mode == "calc":
				new_default = self._get_intersect_affinity(prev_default, affinity, affinity)
			else:
				new_default = mode
			self._set_default_irq_affinity(new_default)
			irq_original.default = prev_default
		self._storage.set(self._irq_storage_key, irq_original)

def _restore_all_irq_affinity(self):
		irq_original = self._storage.get(self._irq_storage_key, None)
		if irq_original is None:
			return
		for irq, affinity in irq_original.irqs.items():
			self._set_irq_affinity(irq, affinity, True)
		if self._default_irq_smp_affinity_value != "ignore":
			affinity = irq_original.default
			self._set_default_irq_affinity(affinity)
		self._storage.unset(self._irq_storage_key)

def _verify_irq_affinity(self, irq_description, correct_affinity,
			current_affinity):
		"""
		Check that current_affinity is a subset of correct_affinity, log
		the result and return it
		"""
		is_subset = set(current_affinity).issubset(set(correct_affinity))
		if not is_subset:
			log.error(consts.STR_VERIFY_PROFILE_VALUE_FAIL
					% (irq_description, current_affinity,
					correct_affinity))
			return is_subset
		log.info(consts.STR_VERIFY_PROFILE_VALUE_OK
				% (irq_description, current_affinity))
		return is_subset

def _verify_all_irq_affinity(self, correct_affinity, ignore_missing):
		"""
		Verify SMP affinity of all IRQs and the default IRQ SMP affinity

		The affinity of every IRQ has to be a subset of correct_affinity.
		IRQs recorded as not supporting the affinity change are skipped if
		ignore_missing is set. The default IRQ SMP affinity is verified
		unless self._default_irq_smp_affinity_value is "ignore"; it is
		compared with correct_affinity for "calc" and with the configured
		CPU list otherwise. Returns True if everything verified matches.
		"""
		irq_original = self._storage.get(self._irq_storage_key, None)
		# There may be nothing stored (the get() above falls back to None),
		# no IRQ is known to be unchangeable then
		if irq_original is None:
			unchangeable = []
		else:
			unchangeable = irq_original.unchangeable
		irqs = procfs.interrupts()
		res = True
		for irq in irqs.keys():
			if irq in unchangeable and ignore_missing:
				description = "IRQ %s does not support changing SMP affinity" % irq
				log.info(consts.STR_VERIFY_PROFILE_VALUE_MISSING % description)
				continue
			try:
				current_affinity = irqs[irq]["affinity"]
				log.debug("Read SMP affinity of IRQ '%s': '%s'"
						% (irq, current_affinity))
				irq_description = "SMP affinity of IRQ %s" % irq
				if not self._verify_irq_affinity(
						irq_description,
						correct_affinity,
						current_affinity):
					res = False
			except KeyError:
				continue

		current_affinity_hex = self._cmd.read_file(
				"/proc/irq/default_smp_affinity")
		current_affinity = self._cmd.hex2cpulist(current_affinity_hex)
		if self._default_irq_smp_affinity_value != "ignore":
			if self._default_irq_smp_affinity_value == "calc":
				correct_default = correct_affinity
			else:
				correct_default = self._default_irq_smp_affinity_value
			# the argument order is (description, correct, current), the same
			# as for the individual IRQs above
			if not self._verify_irq_affinity("default IRQ SMP affinity",
					correct_default, current_affinity):
				res = False
		return res

@command_custom("isolated_cores", per_device = False, priority = 10)
	def _isolated_cores(self, enabling, value, verify, ignore_missing, instance):
		affinity = None
		self._affinity = None
		if value is not None:
			isolated = set(self._cmd.cpulist_unpack(value))
			present = set(self._cpus)
			if isolated.issubset(present):
				affinity = list(present - isolated)
				self._affinity = self._cmd.cpulist2string(affinity)
			else:
				str_cpus = self._cmd.cpulist2string(self._cpus)
				log.error("Invalid isolated_cores specified, '%s' does not match available cores '%s'"
						% (value, str_cpus))
		if (enabling or verify) and affinity is None:
			return None
		# currently only IRQ affinity verification is supported
		if verify:
			if self._irq_process:
				return self._verify_all_irq_affinity(affinity, ignore_missing)
			return True
		elif enabling:
			if self._cgroup:
				self._cgroup_set_affinity()
				ps_affinity = "cgroup.%s" % self._cgroup
			else:
				ps_affinity = affinity
			self._set_ps_affinity(ps_affinity)
			if self._irq_process:
				self._set_all_irq_affinity(affinity)
		else:
			# Restoring processes' affinity is done in
			# _instance_unapply_static()
			if self._irq_process:
				self._restore_all_irq_affinity()
		return True

def _sched_assembly_path(self, prefix, namespace, knob):
		if prefix == "":
			path = "%s/%s" % (namespace, knob)
		else:
			path = "%s/%s/%s" % (prefix, namespace, knob)
		return "/sys/kernel/debug/%s" % path

# map to kernel 6.6 paths, "" means that knob was dropped
	def _sched_assembly_path2(self, path, prefix, namespace, knob):
		lpath = path
		if namespace == "sched":
			lknob = self._dict_sched_knob_map.get(knob)
			if lknob is not None:
				if lknob:
					lpath = self._sched_assembly_path(prefix, namespace, lknob)
				else:
					lpath = ""
		return lpath

def _get_sched_knob_path(self, prefix, namespace, knob):
		key = "%s_%s_%s" % (prefix, namespace, knob)
		path = self._sched_knob_paths_cache.get(key)
		if path or path == "":
			return path
		path = "/proc/sys/kernel/%s_%s" % (namespace, knob)
		if not os.path.exists(path):
			path = self._sched_assembly_path(prefix, namespace, knob)
			# kernel 6.6 drops and renames some knobs
			if not os.path.exists(path):
				path = self._sched_assembly_path2(path, prefix, namespace, knob)
			if path != "" and self._secure_boot_hint is None:
				self._secure_boot_hint = True
		self._sched_knob_paths_cache[key] = path
		return path

def _get_sched_knob(self, prefix, namespace, knob):
		data = None
		path = self._get_sched_knob_path(prefix, namespace, knob)
		if path != "":
			data = self._cmd.read_file(path, err_ret = None)
			if data is None:
				log.error("Error reading '%s'" % knob)
				if self._secure_boot_hint:
					log.error("This may not work with Secure Boot or kernel_lockdown (this hint is logged only once)")
					self._secure_boot_hint = False
		return data

def _set_sched_knob(self, prefix, namespace, knob, value, sim, remove = False):
		if value is None:
			return None
		path = self._get_sched_knob_path(prefix, namespace, knob)
		if not path:
			log.debug("knob '%s' ignored, unsupported by kernel" % knob)
			return None
		if not sim:
			if not self._cmd.write_to_file(path, value, \
				no_error = [errno.ENOENT] if remove else False):
					log.error("Error writing value '%s' to '%s'" % (value, knob))
		return value

@command_get("sched_min_granularity_ns")
	def _get_sched_min_granularity_ns(self, instance):
		return self._get_sched_knob("", "sched", "min_granularity_ns")

@command_set("sched_min_granularity_ns")
	def _set_sched_min_granularity_ns(self, value, instance, sim, remove):
		return self._set_sched_knob("", "sched", "min_granularity_ns", value, sim, remove)

@command_get("sched_base_slice_ns")
	def _get_sched_base_slice_ns(self, instance):
		return self._get_sched_min_granularity_ns(instance)

@command_set("sched_base_slice_ns")
	def _set_sched_base_slice_ns(self, value, instance, sim, remove):
		return self._set_sched_min_granularity_ns(value, instance, sim, remove)

@command_get("sched_latency_ns")
	def _get_sched_latency_ns(self, instance):
		return self._get_sched_knob("", "sched", "latency_ns")

@command_set("sched_latency_ns")
	def _set_sched_latency_ns(self, value, instance, sim, remove):
		return self._set_sched_knob("", "sched", "latency_ns", value, sim, remove)

@command_get("sched_wakeup_granularity_ns")
	def _get_sched_wakeup_granularity_ns(self, instance):
		return self._get_sched_knob("", "sched", "wakeup_granularity_ns")

@command_set("sched_wakeup_granularity_ns")
	def _set_sched_wakeup_granularity_ns(self, value, instance, sim, remove):
		return self._set_sched_knob("", "sched", "wakeup_granularity_ns", value, sim, remove)

@command_get("sched_tunable_scaling")
	def _get_sched_tunable_scaling(self, instance):
		return self._get_sched_knob("", "sched", "tunable_scaling")

@command_set("sched_tunable_scaling")
	def _set_sched_tunable_scaling(self, value, instance, sim, remove):
		return self._set_sched_knob("", "sched", "tunable_scaling", value, sim, remove)

@command_get("sched_migration_cost_ns")
	def _get_sched_migration_cost_ns(self, instance):
		return self._get_sched_knob("", "sched", "migration_cost_ns")

@command_set("sched_migration_cost_ns")
	def _set_sched_migration_cost_ns(self, value, instance, sim, remove):
		return self._set_sched_knob("", "sched", "migration_cost_ns", value, sim, remove)

@command_get("sched_nr_migrate")
	def _get_sched_nr_migrate(self, instance):
		return self._get_sched_knob("", "sched", "nr_migrate")

@command_set("sched_nr_migrate")
	def _set_sched_nr_migrate(self, value, instance, sim, remove):
		return self._set_sched_knob("", "sched", "nr_migrate", value, sim, remove)

@command_get("numa_balancing_scan_delay_ms")
	def _get_numa_balancing_scan_delay_ms(self, instance):
		return self._get_sched_knob("sched", "numa_balancing", "scan_delay_ms")

@command_set("numa_balancing_scan_delay_ms")
	def _set_numa_balancing_scan_delay_ms(self, value, instance, sim, remove):
		return self._set_sched_knob("sched", "numa_balancing", "scan_delay_ms", value, sim, remove)

@command_get("numa_balancing_scan_period_min_ms")
	def _get_numa_balancing_scan_period_min_ms(self, instance):
		return self._get_sched_knob("sched", "numa_balancing", "scan_period_min_ms")

@command_set("numa_balancing_scan_period_min_ms")
	def _set_numa_balancing_scan_period_min_ms(self, value, instance, sim, remove):
		return self._set_sched_knob("sched", "numa_balancing", "scan_period_min_ms", value, sim, remove)

@command_get("numa_balancing_scan_period_max_ms")
	def _get_numa_balancing_scan_period_max_ms(self, instance):
		return self._get_sched_knob("sched", "numa_balancing", "scan_period_max_ms")

@command_set("numa_balancing_scan_period_max_ms")
	def _set_numa_balancing_scan_period_max_ms(self, value, instance, sim, remove):
		return self._set_sched_knob("sched", "numa_balancing", "scan_period_max_ms", value, sim, remove)

@command_get("numa_balancing_scan_size_mb")
	def _get_numa_balancing_scan_size_mb(self, instance):
		return self._get_sched_knob("sched", "numa_balancing", "scan_size_mb")

@command_set("numa_balancing_scan_size_mb")
	def _set_numa_balancing_scan_size_mb(self, value, instance, sim, remove):
		return self._set_sched_knob("sched", "numa_balancing", "scan_size_mb", value, sim, remove)