linux-zen-desktop/drivers/iommu/iommufd/vfio_compat.c

// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES
 */
#include <linux/file.h>
#include <linux/interval_tree.h>
#include <linux/iommu.h>
#include <linux/iommufd.h>
#include <linux/slab.h>
#include <linux/vfio.h>
#include <uapi/linux/vfio.h>
#include <uapi/linux/iommufd.h>

#include "iommufd_private.h"

static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);

	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas || !iommufd_lock_obj(&ictx->vfio_ioas->obj))
		goto out_unlock;
	ioas = ictx->vfio_ioas;
out_unlock:
	xa_unlock(&ictx->objects);
	return ioas;
}

/**
 * iommufd_vfio_compat_ioas_get_id - Look up the ID of the compat IOAS
 * @ictx: Context to operate on
 * @out_ioas_id: Filled with the IOAS ID of the compatibility IOAS on success
 *
 * Report the ID of the current compatibility IOAS. The ID can be passed into
 * other functions that take an ioas_id. This does not create a compat IOAS;
 * if none can be obtained the error from get_compat_ioas() is returned and
 * @out_ioas_id is left untouched.
 *
 * Return: 0 on success or a negative errno.
 */
int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)
{
	struct iommufd_ioas *compat = get_compat_ioas(ictx);

	if (IS_ERR(compat))
		return PTR_ERR(compat);

	*out_ioas_id = compat->obj.id;
	/* Only the ID is needed, drop the reference taken by the lookup */
	iommufd_put_object(&compat->obj);
	return 0;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached
 * @ictx: Context to operate on
 *
 * Switch the context into no-iommu mode, which allows selecting
 * VFIO_NOIOMMU_IOMMU and blocks the normal types. The switch is refused when
 * a compat IOAS has already been assigned to the context.
 *
 * Return: 0 on success, -EINVAL if a compat IOAS is already set.
 */
int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)
{
	int ret = -EINVAL;

	/* vfio_ioas and no_iommu_mode are both updated under the xarray lock */
	xa_lock(&ictx->objects);
	if (!ictx->vfio_ioas) {
		ictx->no_iommu_mode = 1;
		ret = 0;
	}
	xa_unlock(&ictx->objects);

	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);

/**
 * iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created
 * @ictx: Context to operate on
 *
 * The vfio compatibility ioctls have no IOAS ID input in their ABI, so they
 * all operate on one compatibility IOAS stored in the context. Only attaching
 * a group should cause a default creation of that internal ioas. If a usable
 * ioas has already been assigned somehow, the freshly allocated one is
 * discarded and the existing assignment is kept.
 *
 * Return: 0 if a compat IOAS is in place, -EINVAL if the context is in
 * no-iommu mode, or the error from iommufd_ioas_alloc().
 */
int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
{
	struct iommufd_ioas *new_ioas;
	int ret;

	/* Allocate up front, before the xarray lock is taken */
	new_ioas = iommufd_ioas_alloc(ictx);
	if (IS_ERR(new_ioas))
		return PTR_ERR(new_ioas);

	xa_lock(&ictx->objects);
	if (ictx->no_iommu_mode) {
		/*
		 * VFIO won't allow attaching a container to both iommu and no
		 * iommu operation
		 */
		ret = -EINVAL;
	} else if (ictx->vfio_ioas &&
		   iommufd_lock_obj(&ictx->vfio_ioas->obj)) {
		/* A live compat IOAS is already installed, keep using it */
		iommufd_put_object(&ictx->vfio_ioas->obj);
		ret = 0;
	} else {
		ictx->vfio_ioas = new_ioas;
		xa_unlock(&ictx->objects);

		/*
		 * An automatically created compat IOAS is treated as a
		 * userspace created object. Userspace can learn the ID via
		 * IOMMU_VFIO_IOAS_GET, and if not manually destroyed it will
		 * be destroyed automatically at iommufd release.
		 */
		iommufd_object_finalize(ictx, &new_ioas->obj);
		return 0;
	}
	xa_unlock(&ictx->objects);

	/* The new allocation was not installed, throw it away */
	iommufd_object_abort(ictx, &new_ioas->obj);
	return ret;
}
EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);

int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)
{
	struct iommu_vfio_ioas *cmd = ucmd->cmd;
	struct iommufd_ioas *ioas;

	if (cmd->__reserved)
		return -EOPNOTSUPP;
	switch (cmd->op) {
	case IOMMU_VFIO_IOAS_GET:
		ioas = get_compat_ioas(ucmd->ictx);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		cmd->ioas_id = ioas->obj.id;
		iommufd_put_object(&ioas->obj);
		return iommufd_ucmd_respond(ucmd, sizeof(*cmd));

	case IOMMU_VFIO_IOAS_SET:
		ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);
		if (IS_ERR(ioas))
			return PTR_ERR(ioas);
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = ioas;
		xa_unlock(&ucmd->ictx->objects);
		iommufd_put_object(&ioas->obj);
		return 0;

	case IOMMU_VFIO_IOAS_CLEAR:
		xa_lock(&ucmd->ictx->objects);
		ucmd->ictx->vfio_ioas = NULL;
		xa_unlock(&ucmd->ictx->objects);
		return 0;
	default:
		return -EOPNOTSUPP;
	}
}

/*
 * Emulate VFIO_IOMMU_MAP_DMA on top of the compat IOAS.
 *
 * Reads a struct vfio_iommu_type1_dma_map from @arg (up to and including the
 * size field), translates the VFIO read/write flags into IOMMU protection
 * bits and maps the user range at the requested IOVA. @cmd is not used.
 *
 * Returns the result of iopt_map_user_pages(), -EFAULT if @arg cannot be
 * read, -EINVAL for a short argsz or unsupported flags, or the error from
 * get_compat_ioas().
 */
static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				void __user *arg)
{
	const u32 valid_flags = VFIO_DMA_MAP_FLAG_READ |
				VFIO_DMA_MAP_FLAG_WRITE;
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
	struct vfio_iommu_type1_dma_map req;
	struct iommufd_ioas *compat;
	unsigned long iova;
	int prot;
	int rc;

	if (copy_from_user(&req, arg, minsz))
		return -EFAULT;
	if (req.argsz < minsz)
		return -EINVAL;
	if (req.flags & ~valid_flags)
		return -EINVAL;

	prot = IOMMU_CACHE;
	if (req.flags & VFIO_DMA_MAP_FLAG_READ)
		prot |= IOMMU_READ;
	if (req.flags & VFIO_DMA_MAP_FLAG_WRITE)
		prot |= IOMMU_WRITE;

	compat = get_compat_ioas(ictx);
	if (IS_ERR(compat))
		return PTR_ERR(compat);

	/*
	 * Maps created through the legacy interface always use VFIO compatible
	 * rlimit accounting. If the user wishes to use the faster user based
	 * rlimit accounting then they must use the new interface.
	 */
	iova = req.iova;
	rc = iopt_map_user_pages(ictx, &compat->iopt, &iova,
				 u64_to_user_ptr(req.vaddr), req.size, prot,
				 0);
	iommufd_put_object(&compat->obj);
	return rc;
}

/*
 * Emulate VFIO_IOMMU_UNMAP_DMA on top of the compat IOAS.
 *
 * Reads a struct vfio_iommu_type1_dma_unmap from @arg (up to and including
 * the size field), then either unmaps everything (VFIO_DMA_UNMAP_FLAG_ALL,
 * which requires iova and size to both be zero) or unmaps the requested
 * range. The amount the iopt helper reported in 'unmapped' is written back to
 * userspace in the size field. @cmd is not used.
 *
 * Returns the result of the unmap helper, the error from iopt_cut_iova(),
 * -EFAULT if @arg cannot be read or written, -EINVAL for a short argsz,
 * unsupported flags or a non-zero range combined with the ALL flag, or the
 * error from get_compat_ioas().
 */
static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,
				  void __user *arg)
{
	size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
	/*
	 * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new
	 * dirty tracking direction:
	 *  https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/
	 *  https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/
	 */
	u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;
	struct vfio_iommu_type1_dma_unmap unmap;
	unsigned long unmapped = 0;
	struct iommufd_ioas *ioas;
	int rc;

	if (copy_from_user(&unmap, arg, minsz))
		return -EFAULT;

	if (unmap.argsz < minsz || unmap.flags & ~supported_flags)
		return -EINVAL;

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {
		/* The ALL flag must not be combined with an explicit range */
		if (unmap.iova != 0 || unmap.size != 0) {
			rc = -EINVAL;
			goto err_put;
		}
		rc = iopt_unmap_all(&ioas->iopt, &unmapped);
	} else {
		/*
		 * disable_large_pages is set via iopt_disable_large_pages()
		 * when VFIO_TYPE1_IOMMU is selected in set_iommu.
		 */
		if (READ_ONCE(ioas->iopt.disable_large_pages)) {
			/*
			 * Create cuts at the start and last of the requested
			 * range. If the start IOVA is 0 then it doesn't need to
			 * be cut.
			 */
			unsigned long iovas[] = { unmap.iova + unmap.size - 1,
						  unmap.iova - 1 };

			/* Only the first entry is used when unmap.iova is 0 */
			rc = iopt_cut_iova(&ioas->iopt, iovas,
					   unmap.iova ? 2 : 1);
			if (rc)
				goto err_put;
		}
		rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,
				     &unmapped);
	}
	/*
	 * The unmapped amount is reported to userspace whether or not the
	 * unmap helper returned an error; a failed copy overrides rc.
	 */
	unmap.size = unmapped;
	if (copy_to_user(arg, &unmap, minsz))
		rc = -EFAULT;

err_put:
	iommufd_put_object(&ioas->obj);
	return rc;
}

/*
 * Answer the VFIO_DMA_CC_IOMMU extension query.
 *
 * Returns 1 when every hw pagetable on the compat IOAS's hwpt_list has
 * enforce_cache_coherency set (an empty list also yields 1), 0 as soon as one
 * without it is found, or the error from get_compat_ioas().
 */
static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)
{
	struct iommufd_hw_pagetable *hwpt;
	struct iommufd_ioas *compat;
	int all_coherent = 1;

	compat = get_compat_ioas(ictx);
	if (IS_ERR(compat))
		return PTR_ERR(compat);

	/* The hwpt list is walked under the IOAS mutex */
	mutex_lock(&compat->mutex);
	list_for_each_entry(hwpt, &compat->hwpt_list, hwpt_item) {
		if (hwpt->enforce_cache_coherency)
			continue;
		all_coherent = 0;
		break;
	}
	mutex_unlock(&compat->mutex);

	iommufd_put_object(&compat->obj);
	return all_coherent;
}

/*
 * Emulate VFIO_CHECK_EXTENSION.
 *
 * Returns 1 for supported extensions, 0 for unsupported or unknown ones, and
 * for VFIO_DMA_CC_IOMMU whatever iommufd_vfio_cc_iommu() reports (which may
 * be a negative errno).
 */
static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,
					unsigned long type)
{
	switch (type) {
	case VFIO_DMA_CC_IOMMU:
		return iommufd_vfio_cc_iommu(ictx);

	case VFIO_NOIOMMU_IOMMU:
		return IS_ENABLED(CONFIG_VFIO_NOIOMMU);

	case VFIO_TYPE1_IOMMU:
	case VFIO_TYPE1v2_IOMMU:
	case VFIO_UNMAP_ALL:
		return 1;

	/*
	 * This is obsolete, and to be removed from VFIO. It was an incomplete
	 * idea that got merged.
	 * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/
	 */
	case VFIO_TYPE1_NESTING_IOMMU:
	/*
	 * VFIO_DMA_MAP_FLAG_VADDR
	 * https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/
	 * https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/
	 *
	 * It is hard to see how this could be implemented safely.
	 */
	case VFIO_UPDATE_VADDR:
	default:
		break;
	}

	/* Everything not explicitly handled above is reported unsupported */
	return 0;
}

/*
 * Emulate VFIO_SET_IOMMU.
 *
 * In no-iommu mode only VFIO_NOIOMMU_IOMMU is accepted (when
 * CONFIG_VFIO_NOIOMMU is enabled) and it requires CAP_SYS_RAWIO. Otherwise
 * only VFIO_TYPE1_IOMMU and VFIO_TYPE1v2_IOMMU are accepted and a compat IOAS
 * must exist.
 *
 * Returns 0 on success, -EPERM if the capability check fails, -EINVAL for an
 * unacceptable type, the error from get_compat_ioas(), or the result of
 * iopt_disable_large_pages() for VFIO_TYPE1_IOMMU.
 */
static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)
{
	struct iommufd_ioas *compat;
	int rc = 0;

	if (READ_ONCE(ictx->no_iommu_mode)) {
		/*
		 * Emulation for NOIOMMU is imperfect in that VFIO blocks
		 * almost all other ioctls. We let them keep working but they
		 * mostly fail since no IOAS should exist.
		 */
		if (!IS_ENABLED(CONFIG_VFIO_NOIOMMU) ||
		    type != VFIO_NOIOMMU_IOMMU)
			return -EINVAL;
		return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
	}

	if (type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU)
		return -EINVAL;

	/* VFIO fails the set_iommu if there is no group */
	compat = get_compat_ioas(ictx);
	if (IS_ERR(compat))
		return PTR_ERR(compat);

	/*
	 * The difference between TYPE1 and TYPE1v2 is the ability to unmap in
	 * the middle of mapped ranges. This is complicated by huge page support
	 * which creates single large IOPTEs that cannot be split by the iommu
	 * driver. TYPE1 is very old at this point and likely nothing uses it,
	 * however it is simple enough to emulate by simply disabling the
	 * problematic large IOPTEs. Then we can safely unmap within any range.
	 */
	if (type == VFIO_TYPE1_IOMMU)
		rc = iopt_disable_large_pages(&compat->iopt);

	iommufd_put_object(&compat->obj);
	return rc;
}

static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)
{
	struct io_pagetable *iopt = &ioas->iopt;
	unsigned long pgsize_bitmap = ULONG_MAX;
	struct iommu_domain *domain;
	unsigned long index;

	down_read(&iopt->domains_rwsem);
	xa_for_each(&iopt->domains, index, domain)
		pgsize_bitmap &= domain->pgsize_bitmap;

	/* See vfio_update_pgsize_bitmap() */
	if (pgsize_bitmap & ~PAGE_MASK) {
		pgsize_bitmap &= PAGE_MASK;
		pgsize_bitmap |= PAGE_SIZE;
	}
	pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);
	up_read(&iopt->domains_rwsem);
	return pgsize_bitmap;
}

/*
 * Emit the VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE capability at @cur.
 *
 * Walks the spans of ioas->iopt.reserved_itree over [0, ULONG_MAX] and
 * reports every span flagged is_hole as one struct vfio_iova_range. A range
 * is only copied to userspace while the capability including that range still
 * fits in @avail bytes, but every hole is counted regardless, so the return
 * value is the size the complete capability needs.
 *
 * Returns that required size, or -EFAULT if a copy to userspace fails.
 */
static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,
				 struct vfio_info_cap_header __user *cur,
				 size_t avail)
{
	struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =
		container_of(cur,
			     struct vfio_iommu_type1_info_cap_iova_range __user,
			     header);
	struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,
			.version = 1,
		},
	};
	struct interval_tree_span_iter span;

	interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,
				    ULONG_MAX) {
		struct vfio_iova_range range;

		if (!span.is_hole)
			continue;
		range.start = span.start_hole;
		range.end = span.last_hole;
		/* Skip the copy, but keep counting, once the buffer is full */
		if (avail >= struct_size(&cap_iovas, iova_ranges,
					 cap_iovas.nr_iovas + 1) &&
		    copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],
				 &range, sizeof(range)))
			return -EFAULT;
		cap_iovas.nr_iovas++;
	}
	/*
	 * The fixed part of the capability is written last, once the final
	 * nr_iovas is known, and only if the whole capability fits.
	 */
	if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&
	    copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))
		return -EFAULT;
	return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);
}

/*
 * Emit the VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL capability at @cur.
 *
 * The capability is only copied to userspace when @avail is large enough to
 * hold it. @ioas is not used.
 *
 * Returns the size of the capability whether or not it was written, or
 * -EFAULT if the copy to userspace fails.
 */
static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,
				      struct vfio_info_cap_header __user *cur,
				      size_t avail)
{
	struct vfio_iommu_type1_info_dma_avail cap_dma = {
		.header = {
			.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,
			.version = 1,
		},
		/*
		 * iommufd's limit is based on the cgroup's memory limit.
		 * Normally vfio would return U16_MAX here, and provide a module
		 * parameter to adjust it. Since S390 qemu userspace actually
		 * pays attention and needs a value bigger than U16_MAX return
		 * U32_MAX.
		 */
		.avail = U32_MAX,
	};

	/* Not enough room: just report how much space is needed */
	if (avail < sizeof(cap_dma))
		return sizeof(cap_dma);

	if (copy_to_user(cur, &cap_dma, sizeof(cap_dma)))
		return -EFAULT;

	return sizeof(cap_dma);
}

/*
 * Emulate VFIO_IOMMU_GET_INFO on top of the compat IOAS.
 *
 * Fills struct vfio_iommu_type1_info with the page size bitmap and then runs
 * each function in fill_fns[] to place a capability after the fixed-size
 * structure in the user buffer at @arg. Capabilities that do not fit within
 * the user supplied argsz are not written, but their sizes are still summed
 * so that argsz can be updated to the total required.
 *
 * Returns 0 on success, -EFAULT if userspace memory cannot be read or
 * written, -EINVAL if argsz is too small for the mandatory fields, the error
 * from get_compat_ioas(), or a negative value returned by a fill function.
 */
static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,
				       void __user *arg)
{
	typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,
				   struct vfio_info_cap_header __user *cur,
				   size_t avail);
	static const fill_cap_fn fill_fns[] = {
		iommufd_fill_cap_dma_avail,
		iommufd_fill_cap_iova,
	};
	size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
	struct vfio_info_cap_header __user *last_cap = NULL;
	struct vfio_iommu_type1_info info = {};
	struct iommufd_ioas *ioas;
	size_t total_cap_size;
	int rc;
	int i;

	if (copy_from_user(&info, arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;
	/* From here on minsz is how many bytes of info are copied back */
	minsz = min_t(size_t, info.argsz, sizeof(info));

	ioas = get_compat_ioas(ictx);
	if (IS_ERR(ioas))
		return PTR_ERR(ioas);

	info.flags = VFIO_IOMMU_INFO_PGSIZES;
	info.iova_pgsizes = iommufd_get_pagesizes(ioas);
	info.cap_offset = 0;

	/* Held across the capability fill loop, released at out_put */
	down_read(&ioas->iopt.iova_rwsem);
	/* Capabilities start right after the fixed-size info structure */
	total_cap_size = sizeof(info);
	for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {
		int cap_size;

		/*
		 * With no room left the fill function is still called, with a
		 * NULL buffer and zero space, to learn the size it needs.
		 */
		if (info.argsz > total_cap_size)
			cap_size = fill_fns[i](ioas, arg + total_cap_size,
					       info.argsz - total_cap_size);
		else
			cap_size = fill_fns[i](ioas, NULL, 0);
		if (cap_size < 0) {
			rc = cap_size;
			goto out_put;
		}
		/*
		 * Chain the previous capability to this one by storing this
		 * capability's offset in the previous header's next field.
		 */
		if (last_cap && info.argsz >= total_cap_size &&
		    put_user(total_cap_size, &last_cap->next)) {
			rc = -EFAULT;
			goto out_put;
		}
		last_cap = arg + total_cap_size;
		total_cap_size += cap_size;
	}

	/*
	 * If the user did not provide enough space then only some caps are
	 * returned and the argsz will be updated to the correct amount to get
	 * all caps.
	 */
	if (info.argsz >= total_cap_size)
		info.cap_offset = sizeof(info);
	info.argsz = total_cap_size;
	info.flags |= VFIO_IOMMU_INFO_CAPS;
	if (copy_to_user(arg, &info, minsz)) {
		rc = -EFAULT;
		goto out_put;
	}
	rc = 0;

out_put:
	up_read(&ioas->iopt.iova_rwsem);
	iommufd_put_object(&ioas->obj);
	return rc;
}

/*
 * Dispatch a VFIO container ioctl to its iommufd emulation.
 *
 * @arg is passed through as an integer for the ioctls that take a plain
 * value and as a user pointer for the ones that take a structure.
 *
 * Returns the handler's result, VFIO_API_VERSION for VFIO_GET_API_VERSION,
 * or -ENOIOCTLCMD for VFIO_IOMMU_DIRTY_PAGES and any unrecognised command.
 */
int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,
		       unsigned long arg)
{
	void __user *uarg = (void __user *)arg;

	switch (cmd) {
	case VFIO_GET_API_VERSION:
		return VFIO_API_VERSION;
	case VFIO_CHECK_EXTENSION:
		return iommufd_vfio_check_extension(ictx, arg);
	case VFIO_SET_IOMMU:
		return iommufd_vfio_set_iommu(ictx, arg);
	case VFIO_IOMMU_GET_INFO:
		return iommufd_vfio_iommu_get_info(ictx, uarg);
	case VFIO_IOMMU_MAP_DMA:
		return iommufd_vfio_map_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_UNMAP_DMA:
		return iommufd_vfio_unmap_dma(ictx, cmd, uarg);
	case VFIO_IOMMU_DIRTY_PAGES:
	default:
		break;
	}

	/* Not emulated */
	return -ENOIOCTLCMD;
}
Initial commit 2023-08-30 17:31:07 +02:00			`// SPDX-License-Identifier: GPL-2.0-only`
			`/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES`
			`*/`
			`#include <linux/file.h>`
			`#include <linux/interval_tree.h>`
			`#include <linux/iommu.h>`
			`#include <linux/iommufd.h>`
			`#include <linux/slab.h>`
			`#include <linux/vfio.h>`
			`#include <uapi/linux/vfio.h>`
			`#include <uapi/linux/iommufd.h>`

			`#include "iommufd_private.h"`

			`static struct iommufd_ioas *get_compat_ioas(struct iommufd_ctx *ictx)`
			`{`
			`struct iommufd_ioas *ioas = ERR_PTR(-ENODEV);`

			`xa_lock(&ictx->objects);`
			`if (!ictx->vfio_ioas \|\| !iommufd_lock_obj(&ictx->vfio_ioas->obj))`
			`goto out_unlock;`
			`ioas = ictx->vfio_ioas;`
			`out_unlock:`
			`xa_unlock(&ictx->objects);`
			`return ioas;`
			`}`

			`/**`
			`* iommufd_vfio_compat_ioas_get_id - Ensure a compat IOAS exists`
			`* @ictx: Context to operate on`
			`* @out_ioas_id: The IOAS ID of the compatibility IOAS`
			`*`
			`* Return the ID of the current compatibility IOAS. The ID can be passed into`
			`* other functions that take an ioas_id.`
			`*/`
			`int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id)`
			`{`
			`struct iommufd_ioas *ioas;`

			`ioas = get_compat_ioas(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`
			`*out_ioas_id = ioas->obj.id;`
			`iommufd_put_object(&ioas->obj);`
			`return 0;`
			`}`
			`EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_get_id, IOMMUFD_VFIO);`

			`/**`
			`* iommufd_vfio_compat_set_no_iommu - Called when a no-iommu device is attached`
			`* @ictx: Context to operate on`
			`*`
			`* This allows selecting the VFIO_NOIOMMU_IOMMU and blocks normal types.`
			`*/`
			`int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx)`
			`{`
			`int ret;`

			`xa_lock(&ictx->objects);`
			`if (!ictx->vfio_ioas) {`
			`ictx->no_iommu_mode = 1;`
			`ret = 0;`
			`} else {`
			`ret = -EINVAL;`
			`}`
			`xa_unlock(&ictx->objects);`
			`return ret;`
			`}`
			`EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_set_no_iommu, IOMMUFD_VFIO);`

			`/**`
			`* iommufd_vfio_compat_ioas_create - Ensure the compat IOAS is created`
			`* @ictx: Context to operate on`
			`*`
			`* The compatibility IOAS is the IOAS that the vfio compatibility ioctls operate`
			`* on since they do not have an IOAS ID input in their ABI. Only attaching a`
			`* group should cause a default creation of the internal ioas, this does nothing`
			`* if an existing ioas has already been assigned somehow.`
			`*/`
			`int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)`
			`{`
			`struct iommufd_ioas *ioas = NULL;`
			`int ret;`

			`ioas = iommufd_ioas_alloc(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`

			`xa_lock(&ictx->objects);`
			`/*`
			`* VFIO won't allow attaching a container to both iommu and no iommu`
			`* operation`
			`*/`
			`if (ictx->no_iommu_mode) {`
			`ret = -EINVAL;`
			`goto out_abort;`
			`}`

			`if (ictx->vfio_ioas && iommufd_lock_obj(&ictx->vfio_ioas->obj)) {`
			`ret = 0;`
			`iommufd_put_object(&ictx->vfio_ioas->obj);`
			`goto out_abort;`
			`}`
			`ictx->vfio_ioas = ioas;`
			`xa_unlock(&ictx->objects);`

			`/*`
			`* An automatically created compat IOAS is treated as a userspace`
			`* created object. Userspace can learn the ID via IOMMU_VFIO_IOAS_GET,`
			`* and if not manually destroyed it will be destroyed automatically`
			`* at iommufd release.`
			`*/`
			`iommufd_object_finalize(ictx, &ioas->obj);`
			`return 0;`

			`out_abort:`
			`xa_unlock(&ictx->objects);`
			`iommufd_object_abort(ictx, &ioas->obj);`
			`return ret;`
			`}`
			`EXPORT_SYMBOL_NS_GPL(iommufd_vfio_compat_ioas_create, IOMMUFD_VFIO);`

			`int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd)`
			`{`
			`struct iommu_vfio_ioas *cmd = ucmd->cmd;`
			`struct iommufd_ioas *ioas;`

			`if (cmd->__reserved)`
			`return -EOPNOTSUPP;`
			`switch (cmd->op) {`
			`case IOMMU_VFIO_IOAS_GET:`
			`ioas = get_compat_ioas(ucmd->ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`
			`cmd->ioas_id = ioas->obj.id;`
			`iommufd_put_object(&ioas->obj);`
			`return iommufd_ucmd_respond(ucmd, sizeof(*cmd));`

			`case IOMMU_VFIO_IOAS_SET:`
6.5.5 2023-10-24 12:59:35 +02:00			`ioas = iommufd_get_ioas(ucmd->ictx, cmd->ioas_id);`
Initial commit 2023-08-30 17:31:07 +02:00			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`
			`xa_lock(&ucmd->ictx->objects);`
			`ucmd->ictx->vfio_ioas = ioas;`
			`xa_unlock(&ucmd->ictx->objects);`
			`iommufd_put_object(&ioas->obj);`
			`return 0;`

			`case IOMMU_VFIO_IOAS_CLEAR:`
			`xa_lock(&ucmd->ictx->objects);`
			`ucmd->ictx->vfio_ioas = NULL;`
			`xa_unlock(&ucmd->ictx->objects);`
			`return 0;`
			`default:`
			`return -EOPNOTSUPP;`
			`}`
			`}`

			`static int iommufd_vfio_map_dma(struct iommufd_ctx *ictx, unsigned int cmd,`
			`void __user *arg)`
			`{`
			`u32 supported_flags = VFIO_DMA_MAP_FLAG_READ \| VFIO_DMA_MAP_FLAG_WRITE;`
			`size_t minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);`
			`struct vfio_iommu_type1_dma_map map;`
			`int iommu_prot = IOMMU_CACHE;`
			`struct iommufd_ioas *ioas;`
			`unsigned long iova;`
			`int rc;`

			`if (copy_from_user(&map, arg, minsz))`
			`return -EFAULT;`

			`if (map.argsz < minsz \|\| map.flags & ~supported_flags)`
			`return -EINVAL;`

			`if (map.flags & VFIO_DMA_MAP_FLAG_READ)`
			`iommu_prot \|= IOMMU_READ;`
			`if (map.flags & VFIO_DMA_MAP_FLAG_WRITE)`
			`iommu_prot \|= IOMMU_WRITE;`

			`ioas = get_compat_ioas(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`

			`/*`
			`* Maps created through the legacy interface always use VFIO compatible`
			`* rlimit accounting. If the user wishes to use the faster user based`
			`* rlimit accounting then they must use the new interface.`
			`*/`
			`iova = map.iova;`
			`rc = iopt_map_user_pages(ictx, &ioas->iopt, &iova, u64_to_user_ptr(map.vaddr),`
			`map.size, iommu_prot, 0);`
			`iommufd_put_object(&ioas->obj);`
			`return rc;`
			`}`

			`static int iommufd_vfio_unmap_dma(struct iommufd_ctx *ictx, unsigned int cmd,`
			`void __user *arg)`
			`{`
			`size_t minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);`
			`/*`
			`* VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is obsoleted by the new`
			`* dirty tracking direction:`
			`* https://lore.kernel.org/kvm/20220731125503.142683-1-yishaih@nvidia.com/`
			`* https://lore.kernel.org/kvm/20220428210933.3583-1-joao.m.martins@oracle.com/`
			`*/`
			`u32 supported_flags = VFIO_DMA_UNMAP_FLAG_ALL;`
			`struct vfio_iommu_type1_dma_unmap unmap;`
			`unsigned long unmapped = 0;`
			`struct iommufd_ioas *ioas;`
			`int rc;`

			`if (copy_from_user(&unmap, arg, minsz))`
			`return -EFAULT;`

			`if (unmap.argsz < minsz \|\| unmap.flags & ~supported_flags)`
			`return -EINVAL;`

			`ioas = get_compat_ioas(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`

			`if (unmap.flags & VFIO_DMA_UNMAP_FLAG_ALL) {`
			`if (unmap.iova != 0 \|\| unmap.size != 0) {`
			`rc = -EINVAL;`
			`goto err_put;`
			`}`
			`rc = iopt_unmap_all(&ioas->iopt, &unmapped);`
			`} else {`
			`if (READ_ONCE(ioas->iopt.disable_large_pages)) {`
			`/*`
			`* Create cuts at the start and last of the requested`
			`* range. If the start IOVA is 0 then it doesn't need to`
			`* be cut.`
			`*/`
			`unsigned long iovas[] = { unmap.iova + unmap.size - 1,`
			`unmap.iova - 1 };`

			`rc = iopt_cut_iova(&ioas->iopt, iovas,`
			`unmap.iova ? 2 : 1);`
			`if (rc)`
			`goto err_put;`
			`}`
			`rc = iopt_unmap_iova(&ioas->iopt, unmap.iova, unmap.size,`
			`&unmapped);`
			`}`
			`unmap.size = unmapped;`
			`if (copy_to_user(arg, &unmap, minsz))`
			`rc = -EFAULT;`

			`err_put:`
			`iommufd_put_object(&ioas->obj);`
			`return rc;`
			`}`

			`static int iommufd_vfio_cc_iommu(struct iommufd_ctx *ictx)`
			`{`
			`struct iommufd_hw_pagetable *hwpt;`
			`struct iommufd_ioas *ioas;`
			`int rc = 1;`

			`ioas = get_compat_ioas(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`

			`mutex_lock(&ioas->mutex);`
			`list_for_each_entry(hwpt, &ioas->hwpt_list, hwpt_item) {`
			`if (!hwpt->enforce_cache_coherency) {`
			`rc = 0;`
			`break;`
			`}`
			`}`
			`mutex_unlock(&ioas->mutex);`

			`iommufd_put_object(&ioas->obj);`
			`return rc;`
			`}`

			`static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx,`
			`unsigned long type)`
			`{`
			`switch (type) {`
			`case VFIO_TYPE1_IOMMU:`
			`case VFIO_TYPE1v2_IOMMU:`
			`case VFIO_UNMAP_ALL:`
			`return 1;`

			`case VFIO_NOIOMMU_IOMMU:`
			`return IS_ENABLED(CONFIG_VFIO_NOIOMMU);`

			`case VFIO_DMA_CC_IOMMU:`
			`return iommufd_vfio_cc_iommu(ictx);`

			`/*`
			`* This is obsolete, and to be removed from VFIO. It was an incomplete`
			`* idea that got merged.`
			`* https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/`
			`*/`
			`case VFIO_TYPE1_NESTING_IOMMU:`
			`return 0;`

			`/*`
			`* VFIO_DMA_MAP_FLAG_VADDR`
			`* https://lore.kernel.org/kvm/1611939252-7240-1-git-send-email-steven.sistare@oracle.com/`
			`* https://lore.kernel.org/all/Yz777bJZjTyLrHEQ@nvidia.com/`
			`*`
			`* It is hard to see how this could be implemented safely.`
			`*/`
			`case VFIO_UPDATE_VADDR:`
			`default:`
			`return 0;`
			`}`
			`}`

			`static int iommufd_vfio_set_iommu(struct iommufd_ctx *ictx, unsigned long type)`
			`{`
			`bool no_iommu_mode = READ_ONCE(ictx->no_iommu_mode);`
			`struct iommufd_ioas *ioas = NULL;`
			`int rc = 0;`

			`/*`
			`* Emulation for NOIOMMU is imperfect in that VFIO blocks almost all`
			`* other ioctls. We let them keep working but they mostly fail since no`
			`* IOAS should exist.`
			`*/`
			`if (IS_ENABLED(CONFIG_VFIO_NOIOMMU) && type == VFIO_NOIOMMU_IOMMU &&`
			`no_iommu_mode) {`
			`if (!capable(CAP_SYS_RAWIO))`
			`return -EPERM;`
			`return 0;`
			`}`

			`if ((type != VFIO_TYPE1_IOMMU && type != VFIO_TYPE1v2_IOMMU) \|\|`
			`no_iommu_mode)`
			`return -EINVAL;`

			`/* VFIO fails the set_iommu if there is no group */`
			`ioas = get_compat_ioas(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`

			`/*`
			`* The difference between TYPE1 and TYPE1v2 is the ability to unmap in`
			`* the middle of mapped ranges. This is complicated by huge page support`
			`* which creates single large IOPTEs that cannot be split by the iommu`
			`* driver. TYPE1 is very old at this point and likely nothing uses it,`
			`* however it is simple enough to emulate by simply disabling the`
			`* problematic large IOPTEs. Then we can safely unmap within any range.`
			`*/`
			`if (type == VFIO_TYPE1_IOMMU)`
			`rc = iopt_disable_large_pages(&ioas->iopt);`
			`iommufd_put_object(&ioas->obj);`
			`return rc;`
			`}`

			`static unsigned long iommufd_get_pagesizes(struct iommufd_ioas *ioas)`
			`{`
			`struct io_pagetable *iopt = &ioas->iopt;`
			`unsigned long pgsize_bitmap = ULONG_MAX;`
			`struct iommu_domain *domain;`
			`unsigned long index;`

			`down_read(&iopt->domains_rwsem);`
			`xa_for_each(&iopt->domains, index, domain)`
			`pgsize_bitmap &= domain->pgsize_bitmap;`

			`/* See vfio_update_pgsize_bitmap() */`
			`if (pgsize_bitmap & ~PAGE_MASK) {`
			`pgsize_bitmap &= PAGE_MASK;`
			`pgsize_bitmap \|= PAGE_SIZE;`
			`}`
			`pgsize_bitmap = max(pgsize_bitmap, ioas->iopt.iova_alignment);`
			`up_read(&iopt->domains_rwsem);`
			`return pgsize_bitmap;`
			`}`

			`static int iommufd_fill_cap_iova(struct iommufd_ioas *ioas,`
			`struct vfio_info_cap_header __user *cur,`
			`size_t avail)`
			`{`
			`struct vfio_iommu_type1_info_cap_iova_range __user *ucap_iovas =`
			`container_of(cur,`
			`struct vfio_iommu_type1_info_cap_iova_range __user,`
			`header);`
			`struct vfio_iommu_type1_info_cap_iova_range cap_iovas = {`
			`.header = {`
			`.id = VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE,`
			`.version = 1,`
			`},`
			`};`
			`struct interval_tree_span_iter span;`

			`interval_tree_for_each_span(&span, &ioas->iopt.reserved_itree, 0,`
			`ULONG_MAX) {`
			`struct vfio_iova_range range;`

			`if (!span.is_hole)`
			`continue;`
			`range.start = span.start_hole;`
			`range.end = span.last_hole;`
			`if (avail >= struct_size(&cap_iovas, iova_ranges,`
			`cap_iovas.nr_iovas + 1) &&`
			`copy_to_user(&ucap_iovas->iova_ranges[cap_iovas.nr_iovas],`
			`&range, sizeof(range)))`
			`return -EFAULT;`
			`cap_iovas.nr_iovas++;`
			`}`
			`if (avail >= struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas) &&`
			`copy_to_user(ucap_iovas, &cap_iovas, sizeof(cap_iovas)))`
			`return -EFAULT;`
			`return struct_size(&cap_iovas, iova_ranges, cap_iovas.nr_iovas);`
			`}`

			`static int iommufd_fill_cap_dma_avail(struct iommufd_ioas *ioas,`
			`struct vfio_info_cap_header __user *cur,`
			`size_t avail)`
			`{`
			`struct vfio_iommu_type1_info_dma_avail cap_dma = {`
			`.header = {`
			`.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL,`
			`.version = 1,`
			`},`
			`/*`
			`* iommufd's limit is based on the cgroup's memory limit.`
			`* Normally vfio would return U16_MAX here, and provide a module`
			`* parameter to adjust it. Since S390 qemu userspace actually`
			`* pays attention and needs a value bigger than U16_MAX return`
			`* U32_MAX.`
			`*/`
			`.avail = U32_MAX,`
			`};`

			`if (avail >= sizeof(cap_dma) &&`
			`copy_to_user(cur, &cap_dma, sizeof(cap_dma)))`
			`return -EFAULT;`
			`return sizeof(cap_dma);`
			`}`

			`static int iommufd_vfio_iommu_get_info(struct iommufd_ctx *ictx,`
			`void __user *arg)`
			`{`
			`typedef int (*fill_cap_fn)(struct iommufd_ioas *ioas,`
			`struct vfio_info_cap_header __user *cur,`
			`size_t avail);`
			`static const fill_cap_fn fill_fns[] = {`
			`iommufd_fill_cap_dma_avail,`
			`iommufd_fill_cap_iova,`
			`};`
			`size_t minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);`
			`struct vfio_info_cap_header __user *last_cap = NULL;`
			`struct vfio_iommu_type1_info info = {};`
			`struct iommufd_ioas *ioas;`
			`size_t total_cap_size;`
			`int rc;`
			`int i;`

			`if (copy_from_user(&info, arg, minsz))`
			`return -EFAULT;`

			`if (info.argsz < minsz)`
			`return -EINVAL;`
			`minsz = min_t(size_t, info.argsz, sizeof(info));`

			`ioas = get_compat_ioas(ictx);`
			`if (IS_ERR(ioas))`
			`return PTR_ERR(ioas);`

			`info.flags = VFIO_IOMMU_INFO_PGSIZES;`
			`info.iova_pgsizes = iommufd_get_pagesizes(ioas);`
			`info.cap_offset = 0;`

			`down_read(&ioas->iopt.iova_rwsem);`
			`total_cap_size = sizeof(info);`
			`for (i = 0; i != ARRAY_SIZE(fill_fns); i++) {`
			`int cap_size;`

			`if (info.argsz > total_cap_size)`
			`cap_size = fill_fns[i](ioas, arg + total_cap_size,`
			`info.argsz - total_cap_size);`
			`else`
			`cap_size = fill_fns[i](ioas, NULL, 0);`
			`if (cap_size < 0) {`
			`rc = cap_size;`
			`goto out_put;`
			`}`
			`if (last_cap && info.argsz >= total_cap_size &&`
			`put_user(total_cap_size, &last_cap->next)) {`
			`rc = -EFAULT;`
			`goto out_put;`
			`}`
			`last_cap = arg + total_cap_size;`
			`total_cap_size += cap_size;`
			`}`

			`/*`
			`* If the user did not provide enough space then only some caps are`
			`* returned and the argsz will be updated to the correct amount to get`
			`* all caps.`
			`*/`
			`if (info.argsz >= total_cap_size)`
			`info.cap_offset = sizeof(info);`
			`info.argsz = total_cap_size;`
			`info.flags \|= VFIO_IOMMU_INFO_CAPS;`
			`if (copy_to_user(arg, &info, minsz)) {`
			`rc = -EFAULT;`
			`goto out_put;`
			`}`
			`rc = 0;`

			`out_put:`
			`up_read(&ioas->iopt.iova_rwsem);`
			`iommufd_put_object(&ioas->obj);`
			`return rc;`
			`}`

			`int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd,`
			`unsigned long arg)`
			`{`
			`void __user *uarg = (void __user *)arg;`

			`switch (cmd) {`
			`case VFIO_GET_API_VERSION:`
			`return VFIO_API_VERSION;`
			`case VFIO_SET_IOMMU:`
			`return iommufd_vfio_set_iommu(ictx, arg);`
			`case VFIO_CHECK_EXTENSION:`
			`return iommufd_vfio_check_extension(ictx, arg);`
			`case VFIO_IOMMU_GET_INFO:`
			`return iommufd_vfio_iommu_get_info(ictx, uarg);`
			`case VFIO_IOMMU_MAP_DMA:`
			`return iommufd_vfio_map_dma(ictx, cmd, uarg);`
			`case VFIO_IOMMU_UNMAP_DMA:`
			`return iommufd_vfio_unmap_dma(ictx, cmd, uarg);`
			`case VFIO_IOMMU_DIRTY_PAGES:`
			`default:`
			`return -ENOIOCTLCMD;`
			`}`
			`return -ENOIOCTLCMD;`
			`}`