linux-zen-server/drivers/iommu/intel/perfmon.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel IOMMU PerfMon
 * Copyright(c) 2023 Intel Corporation.
 */
#define pr_fmt(fmt)	"DMAR: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/dmar.h>
#include "iommu.h"
#include "perfmon.h"

PMU_FORMAT_ATTR(event,		"config:0-27");		/* ES: Events Select */
PMU_FORMAT_ATTR(event_group,	"config:28-31");	/* EGI: Event Group Index */

static struct attribute *iommu_pmu_format_attrs[] = {
	&format_attr_event_group.attr,
	&format_attr_event.attr,
	NULL
};

static struct attribute_group iommu_pmu_format_attr_group = {
	.name = "format",
	.attrs = iommu_pmu_format_attrs,
};

/* The available events are added in attr_update later */
static struct attribute *attrs_empty[] = {
	NULL
};

static struct attribute_group iommu_pmu_events_attr_group = {
	.name = "events",
	.attrs = attrs_empty,
};

static cpumask_t iommu_pmu_cpu_mask;

static ssize_t
cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &iommu_pmu_cpu_mask);
}
static DEVICE_ATTR_RO(cpumask);

static struct attribute *iommu_pmu_cpumask_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL
};

static struct attribute_group iommu_pmu_cpumask_attr_group = {
	.attrs = iommu_pmu_cpumask_attrs,
};

static const struct attribute_group *iommu_pmu_attr_groups[] = {
	&iommu_pmu_format_attr_group,
	&iommu_pmu_events_attr_group,
	&iommu_pmu_cpumask_attr_group,
	NULL
};

static inline struct iommu_pmu *dev_to_iommu_pmu(struct device *dev)
{
	/*
	 * The perf_event creates its own dev for each PMU.
	 * See pmu_dev_alloc()
	 */
	return container_of(dev_get_drvdata(dev), struct iommu_pmu, pmu);
}

#define IOMMU_PMU_ATTR(_name, _format, _filter)				\
	PMU_FORMAT_ATTR(_name, _format);				\
									\
static struct attribute *_name##_attr[] = {				\
	&format_attr_##_name.attr,					\
	NULL								\
};									\
									\
static umode_t								\
_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)	\
{									\
	struct device *dev = kobj_to_dev(kobj);				\
	struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);		\
									\
	if (!iommu_pmu)							\
		return 0;						\
	return (iommu_pmu->filter & _filter) ? attr->mode : 0;		\
}									\
									\
static struct attribute_group _name = {					\
	.name		= "format",					\
	.attrs		= _name##_attr,					\
	.is_visible	= _name##_is_visible,				\
};

IOMMU_PMU_ATTR(filter_requester_id_en,	"config1:0",		IOMMU_PMU_FILTER_REQUESTER_ID);
IOMMU_PMU_ATTR(filter_domain_en,	"config1:1",		IOMMU_PMU_FILTER_DOMAIN);
IOMMU_PMU_ATTR(filter_pasid_en,		"config1:2",		IOMMU_PMU_FILTER_PASID);
IOMMU_PMU_ATTR(filter_ats_en,		"config1:3",		IOMMU_PMU_FILTER_ATS);
IOMMU_PMU_ATTR(filter_page_table_en,	"config1:4",		IOMMU_PMU_FILTER_PAGE_TABLE);
IOMMU_PMU_ATTR(filter_requester_id,	"config1:16-31",	IOMMU_PMU_FILTER_REQUESTER_ID);
IOMMU_PMU_ATTR(filter_domain,		"config1:32-47",	IOMMU_PMU_FILTER_DOMAIN);
IOMMU_PMU_ATTR(filter_pasid,		"config2:0-21",		IOMMU_PMU_FILTER_PASID);
IOMMU_PMU_ATTR(filter_ats,		"config2:24-28",	IOMMU_PMU_FILTER_ATS);
IOMMU_PMU_ATTR(filter_page_table,	"config2:32-36",	IOMMU_PMU_FILTER_PAGE_TABLE);

#define iommu_pmu_en_requester_id(e)		((e) & 0x1)
#define iommu_pmu_en_domain(e)			(((e) >> 1) & 0x1)
#define iommu_pmu_en_pasid(e)			(((e) >> 2) & 0x1)
#define iommu_pmu_en_ats(e)			(((e) >> 3) & 0x1)
#define iommu_pmu_en_page_table(e)		(((e) >> 4) & 0x1)
#define iommu_pmu_get_requester_id(filter)	(((filter) >> 16) & 0xffff)
#define iommu_pmu_get_domain(filter)		(((filter) >> 32) & 0xffff)
#define iommu_pmu_get_pasid(filter)		((filter) & 0x3fffff)
#define iommu_pmu_get_ats(filter)		(((filter) >> 24) & 0x1f)
#define iommu_pmu_get_page_table(filter)	(((filter) >> 32) & 0x1f)

#define iommu_pmu_set_filter(_name, _config, _filter, _idx, _econfig)		\
{										\
	if ((iommu_pmu->filter & _filter) && iommu_pmu_en_##_name(_econfig)) {	\
		dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +	\
			    IOMMU_PMU_CFG_SIZE +				\
			    (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,	\
			    iommu_pmu_get_##_name(_config) | IOMMU_PMU_FILTER_EN);\
	}									\
}

#define iommu_pmu_clear_filter(_filter, _idx)					\
{										\
	if (iommu_pmu->filter & _filter) {					\
		dmar_writel(iommu_pmu->cfg_reg + _idx * IOMMU_PMU_CFG_OFFSET +	\
			    IOMMU_PMU_CFG_SIZE +				\
			    (ffs(_filter) - 1) * IOMMU_PMU_CFG_FILTERS_OFFSET,	\
			    0);							\
	}									\
}

/*
 * Define the event attr related functions
 * Input: _name: event attr name
 *        _string: string of the event in sysfs
 *        _g_idx: event group encoding
 *        _event: event encoding
 */
#define IOMMU_PMU_EVENT_ATTR(_name, _string, _g_idx, _event)			\
	PMU_EVENT_ATTR_STRING(_name, event_attr_##_name, _string)		\
										\
static struct attribute *_name##_attr[] = {					\
	&event_attr_##_name.attr.attr,						\
	NULL									\
};										\
										\
static umode_t									\
_name##_is_visible(struct kobject *kobj, struct attribute *attr, int i)		\
{										\
	struct device *dev = kobj_to_dev(kobj);					\
	struct iommu_pmu *iommu_pmu = dev_to_iommu_pmu(dev);			\
										\
	if (!iommu_pmu)								\
		return 0;							\
	return (iommu_pmu->evcap[_g_idx] & _event) ? attr->mode : 0;		\
}										\
										\
static struct attribute_group _name = {						\
	.name		= "events",						\
	.attrs		= _name##_attr,						\
	.is_visible	= _name##_is_visible,					\
};

IOMMU_PMU_EVENT_ATTR(iommu_clocks,		"event_group=0x0,event=0x001", 0x0, 0x001)
IOMMU_PMU_EVENT_ATTR(iommu_requests,		"event_group=0x0,event=0x002", 0x0, 0x002)
IOMMU_PMU_EVENT_ATTR(pw_occupancy,		"event_group=0x0,event=0x004", 0x0, 0x004)
IOMMU_PMU_EVENT_ATTR(ats_blocked,		"event_group=0x0,event=0x008", 0x0, 0x008)
IOMMU_PMU_EVENT_ATTR(iommu_mrds,		"event_group=0x1,event=0x001", 0x1, 0x001)
IOMMU_PMU_EVENT_ATTR(iommu_mem_blocked,		"event_group=0x1,event=0x020", 0x1, 0x020)
IOMMU_PMU_EVENT_ATTR(pg_req_posted,		"event_group=0x1,event=0x040", 0x1, 0x040)
IOMMU_PMU_EVENT_ATTR(ctxt_cache_lookup,		"event_group=0x2,event=0x001", 0x2, 0x001)
IOMMU_PMU_EVENT_ATTR(ctxt_cache_hit,		"event_group=0x2,event=0x002", 0x2, 0x002)
IOMMU_PMU_EVENT_ATTR(pasid_cache_lookup,	"event_group=0x2,event=0x004", 0x2, 0x004)
IOMMU_PMU_EVENT_ATTR(pasid_cache_hit,		"event_group=0x2,event=0x008", 0x2, 0x008)
IOMMU_PMU_EVENT_ATTR(ss_nonleaf_lookup,		"event_group=0x2,event=0x010", 0x2, 0x010)
IOMMU_PMU_EVENT_ATTR(ss_nonleaf_hit,		"event_group=0x2,event=0x020", 0x2, 0x020)
IOMMU_PMU_EVENT_ATTR(fs_nonleaf_lookup,		"event_group=0x2,event=0x040", 0x2, 0x040)
IOMMU_PMU_EVENT_ATTR(fs_nonleaf_hit,		"event_group=0x2,event=0x080", 0x2, 0x080)
IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_lookup,	"event_group=0x2,event=0x100", 0x2, 0x100)
IOMMU_PMU_EVENT_ATTR(hpt_nonleaf_hit,		"event_group=0x2,event=0x200", 0x2, 0x200)
IOMMU_PMU_EVENT_ATTR(iotlb_lookup,		"event_group=0x3,event=0x001", 0x3, 0x001)
IOMMU_PMU_EVENT_ATTR(iotlb_hit,			"event_group=0x3,event=0x002", 0x3, 0x002)
IOMMU_PMU_EVENT_ATTR(hpt_leaf_lookup,		"event_group=0x3,event=0x004", 0x3, 0x004)
IOMMU_PMU_EVENT_ATTR(hpt_leaf_hit,		"event_group=0x3,event=0x008", 0x3, 0x008)
IOMMU_PMU_EVENT_ATTR(int_cache_lookup,		"event_group=0x4,event=0x001", 0x4, 0x001)
IOMMU_PMU_EVENT_ATTR(int_cache_hit_nonposted,	"event_group=0x4,event=0x002", 0x4, 0x002)
IOMMU_PMU_EVENT_ATTR(int_cache_hit_posted,	"event_group=0x4,event=0x004", 0x4, 0x004)

static const struct attribute_group *iommu_pmu_attr_update[] = {
	&filter_requester_id_en,
	&filter_domain_en,
	&filter_pasid_en,
	&filter_ats_en,
	&filter_page_table_en,
	&filter_requester_id,
	&filter_domain,
	&filter_pasid,
	&filter_ats,
	&filter_page_table,
	&iommu_clocks,
	&iommu_requests,
	&pw_occupancy,
	&ats_blocked,
	&iommu_mrds,
	&iommu_mem_blocked,
	&pg_req_posted,
	&ctxt_cache_lookup,
	&ctxt_cache_hit,
	&pasid_cache_lookup,
	&pasid_cache_hit,
	&ss_nonleaf_lookup,
	&ss_nonleaf_hit,
	&fs_nonleaf_lookup,
	&fs_nonleaf_hit,
	&hpt_nonleaf_lookup,
	&hpt_nonleaf_hit,
	&iotlb_lookup,
	&iotlb_hit,
	&hpt_leaf_lookup,
	&hpt_leaf_hit,
	&int_cache_lookup,
	&int_cache_hit_nonposted,
	&int_cache_hit_posted,
	NULL
};

static inline void __iomem *
iommu_event_base(struct iommu_pmu *iommu_pmu, int idx)
{
	return iommu_pmu->cntr_reg + idx * iommu_pmu->cntr_stride;
}

static inline void __iomem *
iommu_config_base(struct iommu_pmu *iommu_pmu, int idx)
{
	return iommu_pmu->cfg_reg + idx * IOMMU_PMU_CFG_OFFSET;
}

static inline struct iommu_pmu *iommu_event_to_pmu(struct perf_event *event)
{
	return container_of(event->pmu, struct iommu_pmu, pmu);
}

static inline u64 iommu_event_config(struct perf_event *event)
{
	u64 config = event->attr.config;

	return (iommu_event_select(config) << IOMMU_EVENT_CFG_ES_SHIFT) |
	       (iommu_event_group(config) << IOMMU_EVENT_CFG_EGI_SHIFT) |
	       IOMMU_EVENT_CFG_INT;
}

static inline bool is_iommu_pmu_event(struct iommu_pmu *iommu_pmu,
				      struct perf_event *event)
{
	return event->pmu == &iommu_pmu->pmu;
}

static int iommu_pmu_validate_event(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	u32 event_group = iommu_event_group(event->attr.config);

	if (event_group >= iommu_pmu->num_eg)
		return -EINVAL;

	return 0;
}

static int iommu_pmu_validate_group(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct perf_event *sibling;
	int nr = 0;

	/*
	 * All events in a group must be scheduled simultaneously.
	 * Check whether there is enough counters for all the events.
	 */
	for_each_sibling_event(sibling, event->group_leader) {
		if (!is_iommu_pmu_event(iommu_pmu, sibling) ||
		    sibling->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (++nr > iommu_pmu->num_cntr)
			return -EINVAL;
	}

	return 0;
}

static int iommu_pmu_event_init(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	if (event->attr.type != event->pmu->type)
		return -ENOENT;

	/* sampling not supported */
	if (event->attr.sample_period)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	if (iommu_pmu_validate_event(event))
		return -EINVAL;

	hwc->config = iommu_event_config(event);

	return iommu_pmu_validate_group(event);
}

static void iommu_pmu_event_update(struct perf_event *event)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_count, new_count, delta;
	int shift = 64 - iommu_pmu->cntr_width;

again:
	prev_count = local64_read(&hwc->prev_count);
	new_count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
	if (local64_xchg(&hwc->prev_count, new_count) != prev_count)
		goto again;

	/*
	 * The counter width is enumerated. Always shift the counter
	 * before using it.
	 */
	delta = (new_count << shift) - (prev_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
}

static void iommu_pmu_start(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct intel_iommu *iommu = iommu_pmu->iommu;
	struct hw_perf_event *hwc = &event->hw;
	u64 count;

	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(hwc->idx < 0 || hwc->idx >= IOMMU_PMU_IDX_MAX))
		return;

	if (flags & PERF_EF_RELOAD)
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));

	hwc->state = 0;

	/* Always reprogram the period */
	count = dmar_readq(iommu_event_base(iommu_pmu, hwc->idx));
	local64_set((&hwc->prev_count), count);

	/*
	 * The error of ecmd will be ignored.
	 * - The existing perf_event subsystem doesn't handle the error.
	 *   Only IOMMU PMU returns runtime HW error. We don't want to
	 *   change the existing generic interfaces for the specific case.
	 * - It's a corner case caused by HW, which is very unlikely to
	 *   happen. There is nothing SW can do.
	 * - The worst case is that the user will get <not count> with
	 *   perf command, which can give the user some hints.
	 */
	ecmd_submit_sync(iommu, DMA_ECMD_ENABLE, hwc->idx, 0);

	perf_event_update_userpage(event);
}

static void iommu_pmu_stop(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct intel_iommu *iommu = iommu_pmu->iommu;
	struct hw_perf_event *hwc = &event->hw;

	if (!(hwc->state & PERF_HES_STOPPED)) {
		ecmd_submit_sync(iommu, DMA_ECMD_DISABLE, hwc->idx, 0);

		iommu_pmu_event_update(event);

		hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
	}
}

static inline int
iommu_pmu_validate_per_cntr_event(struct iommu_pmu *iommu_pmu,
				  int idx, struct perf_event *event)
{
	u32 event_group = iommu_event_group(event->attr.config);
	u32 select = iommu_event_select(event->attr.config);

	if (!(iommu_pmu->cntr_evcap[idx][event_group] & select))
		return -EINVAL;

	return 0;
}

static int iommu_pmu_assign_event(struct iommu_pmu *iommu_pmu,
				  struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int idx;

	/*
	 * The counters which support limited events are usually at the end.
	 * Schedule them first to accommodate more events.
	 */
	for (idx = iommu_pmu->num_cntr - 1; idx >= 0; idx--) {
		if (test_and_set_bit(idx, iommu_pmu->used_mask))
			continue;
		/* Check per-counter event capabilities */
		if (!iommu_pmu_validate_per_cntr_event(iommu_pmu, idx, event))
			break;
		clear_bit(idx, iommu_pmu->used_mask);
	}
	if (idx < 0)
		return -EINVAL;

	iommu_pmu->event_list[idx] = event;
	hwc->idx = idx;

	/* config events */
	dmar_writeq(iommu_config_base(iommu_pmu, idx), hwc->config);

	iommu_pmu_set_filter(requester_id, event->attr.config1,
			     IOMMU_PMU_FILTER_REQUESTER_ID, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(domain, event->attr.config1,
			     IOMMU_PMU_FILTER_DOMAIN, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(pasid, event->attr.config1,
			     IOMMU_PMU_FILTER_PASID, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(ats, event->attr.config2,
			     IOMMU_PMU_FILTER_ATS, idx,
			     event->attr.config1);
	iommu_pmu_set_filter(page_table, event->attr.config2,
			     IOMMU_PMU_FILTER_PAGE_TABLE, idx,
			     event->attr.config1);

	return 0;
}

static int iommu_pmu_add(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	struct hw_perf_event *hwc = &event->hw;
	int ret;

	ret = iommu_pmu_assign_event(iommu_pmu, event);
	if (ret < 0)
		return ret;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (flags & PERF_EF_START)
		iommu_pmu_start(event, 0);

	return 0;
}

static void iommu_pmu_del(struct perf_event *event, int flags)
{
	struct iommu_pmu *iommu_pmu = iommu_event_to_pmu(event);
	int idx = event->hw.idx;

	iommu_pmu_stop(event, PERF_EF_UPDATE);

	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_REQUESTER_ID, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_DOMAIN, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PASID, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_ATS, idx);
	iommu_pmu_clear_filter(IOMMU_PMU_FILTER_PAGE_TABLE, idx);

	iommu_pmu->event_list[idx] = NULL;
	event->hw.idx = -1;
	clear_bit(idx, iommu_pmu->used_mask);

	perf_event_update_userpage(event);
}

static void iommu_pmu_enable(struct pmu *pmu)
{
	struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
	struct intel_iommu *iommu = iommu_pmu->iommu;

	ecmd_submit_sync(iommu, DMA_ECMD_UNFREEZE, 0, 0);
}

static void iommu_pmu_disable(struct pmu *pmu)
{
	struct iommu_pmu *iommu_pmu = container_of(pmu, struct iommu_pmu, pmu);
	struct intel_iommu *iommu = iommu_pmu->iommu;

	ecmd_submit_sync(iommu, DMA_ECMD_FREEZE, 0, 0);
}

static void iommu_pmu_counter_overflow(struct iommu_pmu *iommu_pmu)
{
	struct perf_event *event;
	u64 status;
	int i;

	/*
	 * Two counters may be overflowed very close. Always check
	 * whether there are more to handle.
	 */
	while ((status = dmar_readq(iommu_pmu->overflow))) {
		for_each_set_bit(i, (unsigned long *)&status, iommu_pmu->num_cntr) {
			/*
			 * Find the assigned event of the counter.
			 * Accumulate the value into the event->count.
			 */
			event = iommu_pmu->event_list[i];
			if (!event) {
				pr_warn_once("Cannot find the assigned event for counter %d\n", i);
				continue;
			}
			iommu_pmu_event_update(event);
		}

		dmar_writeq(iommu_pmu->overflow, status);
	}
}

static irqreturn_t iommu_pmu_irq_handler(int irq, void *dev_id)
{
	struct intel_iommu *iommu = dev_id;

	if (!dmar_readl(iommu->reg + DMAR_PERFINTRSTS_REG))
		return IRQ_NONE;

	iommu_pmu_counter_overflow(iommu->pmu);

	/* Clear the status bit */
	dmar_writel(iommu->reg + DMAR_PERFINTRSTS_REG, DMA_PERFINTRSTS_PIS);

	return IRQ_HANDLED;
}

static int __iommu_pmu_register(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	iommu_pmu->pmu.name		= iommu->name;
	iommu_pmu->pmu.task_ctx_nr	= perf_invalid_context;
	iommu_pmu->pmu.event_init	= iommu_pmu_event_init;
	iommu_pmu->pmu.pmu_enable	= iommu_pmu_enable;
	iommu_pmu->pmu.pmu_disable	= iommu_pmu_disable;
	iommu_pmu->pmu.add		= iommu_pmu_add;
	iommu_pmu->pmu.del		= iommu_pmu_del;
	iommu_pmu->pmu.start		= iommu_pmu_start;
	iommu_pmu->pmu.stop		= iommu_pmu_stop;
	iommu_pmu->pmu.read		= iommu_pmu_event_update;
	iommu_pmu->pmu.attr_groups	= iommu_pmu_attr_groups;
	iommu_pmu->pmu.attr_update	= iommu_pmu_attr_update;
	iommu_pmu->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	iommu_pmu->pmu.module		= THIS_MODULE;

	return perf_pmu_register(&iommu_pmu->pmu, iommu_pmu->pmu.name, -1);
}

static inline void __iomem *
get_perf_reg_address(struct intel_iommu *iommu, u32 offset)
{
	u32 off = dmar_readl(iommu->reg + offset);

	return iommu->reg + off;
}

int alloc_iommu_pmu(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu;
	int i, j, ret;
	u64 perfcap;
	u32 cap;

	if (!ecap_pms(iommu->ecap))
		return 0;

	/* The IOMMU PMU requires the ECMD support as well */
	if (!cap_ecmds(iommu->cap))
		return -ENODEV;

	perfcap = dmar_readq(iommu->reg + DMAR_PERFCAP_REG);
	/* The performance monitoring is not supported. */
	if (!perfcap)
		return -ENODEV;

	/* Sanity check for the number of the counters and event groups */
	if (!pcap_num_cntr(perfcap) || !pcap_num_event_group(perfcap))
		return -ENODEV;

	/* The interrupt on overflow is required */
	if (!pcap_interrupt(perfcap))
		return -ENODEV;

	/* Check required Enhanced Command Capability */
	if (!ecmd_has_pmu_essential(iommu))
		return -ENODEV;

	iommu_pmu = kzalloc(sizeof(*iommu_pmu), GFP_KERNEL);
	if (!iommu_pmu)
		return -ENOMEM;

	iommu_pmu->num_cntr = pcap_num_cntr(perfcap);
	if (iommu_pmu->num_cntr > IOMMU_PMU_IDX_MAX) {
		pr_warn_once("The number of IOMMU counters %d > max(%d), clipping!",
			     iommu_pmu->num_cntr, IOMMU_PMU_IDX_MAX);
		iommu_pmu->num_cntr = IOMMU_PMU_IDX_MAX;
	}

	iommu_pmu->cntr_width = pcap_cntr_width(perfcap);
	iommu_pmu->filter = pcap_filters_mask(perfcap);
	iommu_pmu->cntr_stride = pcap_cntr_stride(perfcap);
	iommu_pmu->num_eg = pcap_num_event_group(perfcap);

	iommu_pmu->evcap = kcalloc(iommu_pmu->num_eg, sizeof(u64), GFP_KERNEL);
	if (!iommu_pmu->evcap) {
		ret = -ENOMEM;
		goto free_pmu;
	}

	/* Parse event group capabilities */
	for (i = 0; i < iommu_pmu->num_eg; i++) {
		u64 pcap;

		pcap = dmar_readq(iommu->reg + DMAR_PERFEVNTCAP_REG +
				  i * IOMMU_PMU_CAP_REGS_STEP);
		iommu_pmu->evcap[i] = pecap_es(pcap);
	}

	iommu_pmu->cntr_evcap = kcalloc(iommu_pmu->num_cntr, sizeof(u32 *), GFP_KERNEL);
	if (!iommu_pmu->cntr_evcap) {
		ret = -ENOMEM;
		goto free_pmu_evcap;
	}
	for (i = 0; i < iommu_pmu->num_cntr; i++) {
		iommu_pmu->cntr_evcap[i] = kcalloc(iommu_pmu->num_eg, sizeof(u32), GFP_KERNEL);
		if (!iommu_pmu->cntr_evcap[i]) {
			ret = -ENOMEM;
			goto free_pmu_cntr_evcap;
		}
		/*
		 * Set to the global capabilities, will adjust according
		 * to per-counter capabilities later.
		 */
		for (j = 0; j < iommu_pmu->num_eg; j++)
			iommu_pmu->cntr_evcap[i][j] = (u32)iommu_pmu->evcap[j];
	}

	iommu_pmu->cfg_reg = get_perf_reg_address(iommu, DMAR_PERFCFGOFF_REG);
	iommu_pmu->cntr_reg = get_perf_reg_address(iommu, DMAR_PERFCNTROFF_REG);
	iommu_pmu->overflow = get_perf_reg_address(iommu, DMAR_PERFOVFOFF_REG);

	/*
	 * Check per-counter capabilities. All counters should have the
	 * same capabilities on Interrupt on Overflow Support and Counter
	 * Width.
	 */
	for (i = 0; i < iommu_pmu->num_cntr; i++) {
		cap = dmar_readl(iommu_pmu->cfg_reg +
				 i * IOMMU_PMU_CFG_OFFSET +
				 IOMMU_PMU_CFG_CNTRCAP_OFFSET);
		if (!iommu_cntrcap_pcc(cap))
			continue;

		/*
		 * It's possible that some counters have a different
		 * capability because of e.g., HW bug. Check the corner
		 * case here and simply drop those counters.
		 */
		if ((iommu_cntrcap_cw(cap) != iommu_pmu->cntr_width) ||
		    !iommu_cntrcap_ios(cap)) {
			iommu_pmu->num_cntr = i;
			pr_warn("PMU counter capability inconsistent, counter number reduced to %d\n",
				iommu_pmu->num_cntr);
		}

		/* Clear the pre-defined events group */
		for (j = 0; j < iommu_pmu->num_eg; j++)
			iommu_pmu->cntr_evcap[i][j] = 0;

		/* Override with per-counter event capabilities */
		for (j = 0; j < iommu_cntrcap_egcnt(cap); j++) {
			cap = dmar_readl(iommu_pmu->cfg_reg + i * IOMMU_PMU_CFG_OFFSET +
					 IOMMU_PMU_CFG_CNTREVCAP_OFFSET +
					 (j * IOMMU_PMU_OFF_REGS_STEP));
			iommu_pmu->cntr_evcap[i][iommu_event_group(cap)] = iommu_event_select(cap);
			/*
			 * Some events may only be supported by a specific counter.
			 * Track them in the evcap as well.
			 */
			iommu_pmu->evcap[iommu_event_group(cap)] |= iommu_event_select(cap);
		}
	}

	iommu_pmu->iommu = iommu;
	iommu->pmu = iommu_pmu;

	return 0;

free_pmu_cntr_evcap:
	for (i = 0; i < iommu_pmu->num_cntr; i++)
		kfree(iommu_pmu->cntr_evcap[i]);
	kfree(iommu_pmu->cntr_evcap);
free_pmu_evcap:
	kfree(iommu_pmu->evcap);
free_pmu:
	kfree(iommu_pmu);

	return ret;
}

void free_iommu_pmu(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	if (iommu_pmu->evcap) {
		int i;

		for (i = 0; i < iommu_pmu->num_cntr; i++)
			kfree(iommu_pmu->cntr_evcap[i]);
		kfree(iommu_pmu->cntr_evcap);
	}
	kfree(iommu_pmu->evcap);
	kfree(iommu_pmu);
	iommu->pmu = NULL;
}

static int iommu_pmu_set_interrupt(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;
	int irq, ret;

	irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PERF + iommu->seq_id, iommu->node, iommu);
	if (irq <= 0)
		return -EINVAL;

	snprintf(iommu_pmu->irq_name, sizeof(iommu_pmu->irq_name), "dmar%d-perf", iommu->seq_id);

	iommu->perf_irq = irq;
	ret = request_threaded_irq(irq, NULL, iommu_pmu_irq_handler,
				   IRQF_ONESHOT, iommu_pmu->irq_name, iommu);
	if (ret) {
		dmar_free_hwirq(irq);
		iommu->perf_irq = 0;
		return ret;
	}
	return 0;
}

static void iommu_pmu_unset_interrupt(struct intel_iommu *iommu)
{
	if (!iommu->perf_irq)
		return;

	free_irq(iommu->perf_irq, iommu);
	dmar_free_hwirq(iommu->perf_irq);
	iommu->perf_irq = 0;
}

static int iommu_pmu_cpu_online(unsigned int cpu, struct hlist_node *node)
{
	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);

	if (cpumask_empty(&iommu_pmu_cpu_mask))
		cpumask_set_cpu(cpu, &iommu_pmu_cpu_mask);

	if (cpumask_test_cpu(cpu, &iommu_pmu_cpu_mask))
		iommu_pmu->cpu = cpu;

	return 0;
}

static int iommu_pmu_cpu_offline(unsigned int cpu, struct hlist_node *node)
{
	struct iommu_pmu *iommu_pmu = hlist_entry_safe(node, typeof(*iommu_pmu), cpuhp_node);
	int target = cpumask_first(&iommu_pmu_cpu_mask);

	/*
	 * The iommu_pmu_cpu_mask has been updated when offline the CPU
	 * for the first iommu_pmu. Migrate the other iommu_pmu to the
	 * new target.
	 */
	if (target < nr_cpu_ids && target != iommu_pmu->cpu) {
		perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
		iommu_pmu->cpu = target;
		return 0;
	}

	if (!cpumask_test_and_clear_cpu(cpu, &iommu_pmu_cpu_mask))
		return 0;

	target = cpumask_any_but(cpu_online_mask, cpu);

	if (target < nr_cpu_ids)
		cpumask_set_cpu(target, &iommu_pmu_cpu_mask);
	else
		return 0;

	perf_pmu_migrate_context(&iommu_pmu->pmu, cpu, target);
	iommu_pmu->cpu = target;

	return 0;
}

static int nr_iommu_pmu;
static enum cpuhp_state iommu_cpuhp_slot;

static int iommu_pmu_cpuhp_setup(struct iommu_pmu *iommu_pmu)
{
	int ret;

	if (!nr_iommu_pmu) {
		ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
					      "driver/iommu/intel/perfmon:online",
					      iommu_pmu_cpu_online,
					      iommu_pmu_cpu_offline);
		if (ret < 0)
			return ret;
		iommu_cpuhp_slot = ret;
	}

	ret = cpuhp_state_add_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);
	if (ret) {
		if (!nr_iommu_pmu)
			cpuhp_remove_multi_state(iommu_cpuhp_slot);
		return ret;
	}
	nr_iommu_pmu++;

	return 0;
}

static void iommu_pmu_cpuhp_free(struct iommu_pmu *iommu_pmu)
{
	cpuhp_state_remove_instance(iommu_cpuhp_slot, &iommu_pmu->cpuhp_node);

	if (--nr_iommu_pmu)
		return;

	cpuhp_remove_multi_state(iommu_cpuhp_slot);
}

void iommu_pmu_register(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	if (__iommu_pmu_register(iommu))
		goto err;

	if (iommu_pmu_cpuhp_setup(iommu_pmu))
		goto unregister;

	/* Set interrupt for overflow */
	if (iommu_pmu_set_interrupt(iommu))
		goto cpuhp_free;

	return;

cpuhp_free:
	iommu_pmu_cpuhp_free(iommu_pmu);
unregister:
	perf_pmu_unregister(&iommu_pmu->pmu);
err:
	pr_err("Failed to register PMU for iommu (seq_id = %d)\n", iommu->seq_id);
	free_iommu_pmu(iommu);
}

void iommu_pmu_unregister(struct intel_iommu *iommu)
{
	struct iommu_pmu *iommu_pmu = iommu->pmu;

	if (!iommu_pmu)
		return;

	iommu_pmu_unset_interrupt(iommu);
	iommu_pmu_cpuhp_free(iommu_pmu);
	perf_pmu_unregister(&iommu_pmu->pmu);
}