463 lines
13 KiB
C
463 lines
13 KiB
C
|
// SPDX-License-Identifier: GPL-2.0
|
||
|
/*
|
||
|
* Intel remapped NVMe device support.
|
||
|
*
|
||
|
* Copyright (c) 2019 Endless Mobile, Inc.
|
||
|
* Author: Daniel Drake <drake@endlessm.com>
|
||
|
*
|
||
|
* Some products ship by default with the SATA controller in "RAID" or
|
||
|
* "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
|
||
|
* mode, which we refer to as "remapped NVMe" mode, any installed NVMe
|
||
|
* devices disappear from the PCI bus, and instead their I/O memory becomes
|
||
|
* available within the AHCI device BARs.
|
||
|
*
|
||
|
* This scheme is understood to be a way of avoiding usage of the standard
|
||
|
* Windows NVMe driver under that OS, instead mandating usage of Intel's
|
||
|
* driver instead, which has better power management, and presumably offers
|
||
|
* some RAID/disk-caching solutions too.
|
||
|
*
|
||
|
* Here in this driver, we support the remapped NVMe mode by claiming the
|
||
|
* AHCI device and creating a fake PCIe root port. On the new bus, the
|
||
|
* original AHCI device is exposed with only minor tweaks. Then, fake PCI
|
||
|
* devices corresponding to the remapped NVMe devices are created. The usual
|
||
|
* ahci and nvme drivers are then expected to bind to these devices and
|
||
|
* operate as normal.
|
||
|
*
|
||
|
* The PCI configuration space for the NVMe devices is completely
|
||
|
* unavailable, so we fake a minimal one and hope for the best.
|
||
|
*
|
||
|
* Interrupts are shared between the AHCI and NVMe devices. For simplicity,
|
||
|
* we only support the legacy interrupt here, although MSI support
|
||
|
* could potentially be added later.
|
||
|
*/
|
||
|
|
||
|
#define MODULE_NAME "intel-nvme-remap"
|
||
|
|
||
|
#include <linux/ahci-remap.h>
|
||
|
#include <linux/irq.h>
|
||
|
#include <linux/kernel.h>
|
||
|
#include <linux/module.h>
|
||
|
#include <linux/pci.h>
|
||
|
|
||
|
#define AHCI_PCI_BAR_STANDARD 5
|
||
|
|
||
|
struct nvme_remap_dev {
|
||
|
struct pci_dev *dev; /* AHCI device */
|
||
|
struct pci_bus *bus; /* our fake PCI bus */
|
||
|
struct pci_sysdata sysdata;
|
||
|
int irq_base; /* our fake interrupts */
|
||
|
|
||
|
/*
|
||
|
* When we detect an all-ones write to a BAR register, this flag
|
||
|
* is set, so that we return the BAR size on the next read (a
|
||
|
* standard PCI behaviour).
|
||
|
* This includes the assumption that an all-ones BAR write is
|
||
|
* immediately followed by a read of the same register.
|
||
|
*/
|
||
|
bool bar_sizing;
|
||
|
|
||
|
/*
|
||
|
* Resources copied from the AHCI device, to be regarded as
|
||
|
* resources on our fake bus.
|
||
|
*/
|
||
|
struct resource ahci_resources[PCI_NUM_RESOURCES];
|
||
|
|
||
|
/* Resources corresponding to the NVMe devices. */
|
||
|
struct resource remapped_dev_mem[AHCI_MAX_REMAP];
|
||
|
|
||
|
/* Number of remapped NVMe devices found. */
|
||
|
int num_remapped_devices;
|
||
|
};
|
||
|
|
||
|
static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
|
||
|
{
|
||
|
return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
|
||
|
}
|
||
|
|
||
|
|
||
|
/******** PCI configuration space **********/
|
||
|
|
||
|
/*
|
||
|
* Helper macros for tweaking returned contents of PCI configuration space.
|
||
|
*
|
||
|
* value contains len bytes of data read from reg.
|
||
|
* If fixup_reg is included in that range, fix up the contents of that
|
||
|
* register to fixed_value.
|
||
|
*/
|
||
|
#define NR_FIX8(fixup_reg, fixed_value) do { \
|
||
|
if (reg <= fixup_reg && fixup_reg < reg + len) \
|
||
|
((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
|
||
|
} while (0)
|
||
|
|
||
|
#define NR_FIX16(fixup_reg, fixed_value) do { \
|
||
|
NR_FIX8(fixup_reg, fixed_value); \
|
||
|
NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
|
||
|
} while (0)
|
||
|
|
||
|
#define NR_FIX24(fixup_reg, fixed_value) do { \
|
||
|
NR_FIX8(fixup_reg, fixed_value); \
|
||
|
NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
|
||
|
NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
|
||
|
} while (0)
|
||
|
|
||
|
#define NR_FIX32(fixup_reg, fixed_value) do { \
|
||
|
NR_FIX16(fixup_reg, (u16) fixed_value); \
|
||
|
NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
|
||
|
} while (0)
|
||
|
|
||
|
/*
|
||
|
* Read PCI config space of the slot 0 (AHCI) device.
|
||
|
* We pass through the read request to the underlying device, but
|
||
|
* tweak the results in some cases.
|
||
|
*/
|
||
|
static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
|
||
|
int len, u32 *value)
|
||
|
{
|
||
|
struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
|
||
|
struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
|
||
|
int ret;
|
||
|
|
||
|
ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
|
||
|
reg, len, value);
|
||
|
if (ret)
|
||
|
return ret;
|
||
|
|
||
|
/*
|
||
|
* Adjust the device class, to prevent this driver from attempting to
|
||
|
* additionally probe the device we're simulating here.
|
||
|
*/
|
||
|
NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
|
||
|
|
||
|
/*
|
||
|
* Unset interrupt pin, otherwise ACPI tries to find routing
|
||
|
* info for our virtual IRQ, fails, and complains.
|
||
|
*/
|
||
|
NR_FIX8(PCI_INTERRUPT_PIN, 0);
|
||
|
|
||
|
/*
|
||
|
* Truncate the AHCI BAR to not include the region that covers the
|
||
|
* hidden devices. This will cause the ahci driver to successfully
|
||
|
* probe th new device (instead of handing it over to this driver).
|
||
|
*/
|
||
|
if (nrdev->bar_sizing) {
|
||
|
NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
|
||
|
nrdev->bar_sizing = false;
|
||
|
}
|
||
|
|
||
|
return PCIBIOS_SUCCESSFUL;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Read PCI config space of a remapped device.
|
||
|
* Since the original PCI config space is inaccessible, we provide a minimal,
|
||
|
* fake config space instead.
|
||
|
*/
|
||
|
static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
|
||
|
int reg, int len, u32 *value)
|
||
|
{
|
||
|
struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
|
||
|
struct resource *remapped_mem;
|
||
|
|
||
|
if (port > nrdev->num_remapped_devices)
|
||
|
return PCIBIOS_DEVICE_NOT_FOUND;
|
||
|
|
||
|
*value = 0;
|
||
|
remapped_mem = &nrdev->remapped_dev_mem[port - 1];
|
||
|
|
||
|
/* Set a Vendor ID, otherwise Linux assumes no device is present */
|
||
|
NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
|
||
|
|
||
|
/* Always appear on & bus mastering */
|
||
|
NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
|
||
|
|
||
|
/* Set class so that nvme driver probes us */
|
||
|
NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
|
||
|
|
||
|
if (nrdev->bar_sizing) {
|
||
|
NR_FIX32(PCI_BASE_ADDRESS_0,
|
||
|
~(resource_size(remapped_mem) - 1));
|
||
|
nrdev->bar_sizing = false;
|
||
|
} else {
|
||
|
resource_size_t mem_start = remapped_mem->start;
|
||
|
|
||
|
mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
|
||
|
NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
|
||
|
mem_start >>= 32;
|
||
|
NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
|
||
|
}
|
||
|
|
||
|
return PCIBIOS_SUCCESSFUL;
|
||
|
}
|
||
|
|
||
|
/* Read PCI configuration space. */
|
||
|
static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
|
||
|
int reg, int len, u32 *value)
|
||
|
{
|
||
|
if (PCI_SLOT(devfn) == 0)
|
||
|
return nvme_remap_pci_read_slot0(bus, reg, len, value);
|
||
|
else
|
||
|
return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
|
||
|
reg, len, value);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Write PCI config space of the slot 0 (AHCI) device.
|
||
|
* Apart from the special case of BAR sizing, we disable all writes.
|
||
|
* Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
|
||
|
* that would affect the operation of the NVMe devices.
|
||
|
*/
|
||
|
static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
|
||
|
int len, u32 value)
|
||
|
{
|
||
|
struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
|
||
|
struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
|
||
|
|
||
|
if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
|
||
|
/*
|
||
|
* Writing all-ones to a BAR means that the size of the
|
||
|
* memory region is being checked. Flag this so that we can
|
||
|
* reply with an appropriate size on the next read.
|
||
|
*/
|
||
|
if (value == ~0)
|
||
|
nrdev->bar_sizing = true;
|
||
|
|
||
|
return ahci_dev_bus->ops->write(ahci_dev_bus,
|
||
|
nrdev->dev->devfn,
|
||
|
reg, len, value);
|
||
|
}
|
||
|
|
||
|
return PCIBIOS_SET_FAILED;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Write PCI config space of a remapped device.
|
||
|
* Since the original PCI config space is inaccessible, we reject all
|
||
|
* writes, except for the special case of BAR probing.
|
||
|
*/
|
||
|
static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
|
||
|
unsigned int port,
|
||
|
int reg, int len, u32 value)
|
||
|
{
|
||
|
struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
|
||
|
|
||
|
if (port > nrdev->num_remapped_devices)
|
||
|
return PCIBIOS_DEVICE_NOT_FOUND;
|
||
|
|
||
|
/*
|
||
|
* Writing all-ones to a BAR means that the size of the memory
|
||
|
* region is being checked. Flag this so that we can reply with
|
||
|
* an appropriate size on the next read.
|
||
|
*/
|
||
|
if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
|
||
|
&& reg <= PCI_BASE_ADDRESS_5) {
|
||
|
nrdev->bar_sizing = true;
|
||
|
return PCIBIOS_SUCCESSFUL;
|
||
|
}
|
||
|
|
||
|
return PCIBIOS_SET_FAILED;
|
||
|
}
|
||
|
|
||
|
/* Write PCI configuration space. */
|
||
|
static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
|
||
|
int reg, int len, u32 value)
|
||
|
{
|
||
|
if (PCI_SLOT(devfn) == 0)
|
||
|
return nvme_remap_pci_write_slot0(bus, reg, len, value);
|
||
|
else
|
||
|
return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
|
||
|
reg, len, value);
|
||
|
}
|
||
|
|
||
|
static struct pci_ops nvme_remap_pci_ops = {
|
||
|
.read = nvme_remap_pci_read,
|
||
|
.write = nvme_remap_pci_write,
|
||
|
};
|
||
|
|
||
|
|
||
|
/******** Initialization & exit **********/
|
||
|
|
||
|
/*
|
||
|
* Find a PCI domain ID to use for our fake bus.
|
||
|
* Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
|
||
|
*/
|
||
|
static int find_free_domain(void)
|
||
|
{
|
||
|
int domain = 0xffff;
|
||
|
struct pci_bus *bus = NULL;
|
||
|
|
||
|
while ((bus = pci_find_next_bus(bus)) != NULL)
|
||
|
domain = max_t(int, domain, pci_domain_nr(bus));
|
||
|
|
||
|
return domain + 1;
|
||
|
}
|
||
|
|
||
|
static int find_remapped_devices(struct nvme_remap_dev *nrdev,
|
||
|
struct list_head *resources)
|
||
|
{
|
||
|
void __iomem *mmio;
|
||
|
int i, count = 0;
|
||
|
u32 cap;
|
||
|
|
||
|
mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
|
||
|
pci_resource_len(nrdev->dev,
|
||
|
AHCI_PCI_BAR_STANDARD));
|
||
|
if (!mmio)
|
||
|
return -ENODEV;
|
||
|
|
||
|
/* Check if this device might have remapped nvme devices. */
|
||
|
if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
|
||
|
!(readl(mmio + AHCI_VSCAP) & 1))
|
||
|
return -ENODEV;
|
||
|
|
||
|
cap = readq(mmio + AHCI_REMAP_CAP);
|
||
|
for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
|
||
|
struct resource *remapped_mem;
|
||
|
|
||
|
if ((cap & (1 << i)) == 0)
|
||
|
continue;
|
||
|
if (readl(mmio + ahci_remap_dcc(i))
|
||
|
!= PCI_CLASS_STORAGE_EXPRESS)
|
||
|
continue;
|
||
|
|
||
|
/* We've found a remapped device */
|
||
|
remapped_mem = &nrdev->remapped_dev_mem[count++];
|
||
|
remapped_mem->start =
|
||
|
pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
|
||
|
+ ahci_remap_base(i);
|
||
|
remapped_mem->end = remapped_mem->start
|
||
|
+ AHCI_REMAP_N_SIZE - 1;
|
||
|
remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
|
||
|
pci_add_resource(resources, remapped_mem);
|
||
|
}
|
||
|
|
||
|
pcim_iounmap(nrdev->dev, mmio);
|
||
|
|
||
|
if (count == 0)
|
||
|
return -ENODEV;
|
||
|
|
||
|
nrdev->num_remapped_devices = count;
|
||
|
dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
|
||
|
nrdev->num_remapped_devices);
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
static void nvme_remap_remove_root_bus(void *data)
|
||
|
{
|
||
|
struct pci_bus *bus = data;
|
||
|
|
||
|
pci_stop_root_bus(bus);
|
||
|
pci_remove_root_bus(bus);
|
||
|
}
|
||
|
|
||
|
static int nvme_remap_probe(struct pci_dev *dev,
|
||
|
const struct pci_device_id *id)
|
||
|
{
|
||
|
struct nvme_remap_dev *nrdev;
|
||
|
LIST_HEAD(resources);
|
||
|
int i;
|
||
|
int ret;
|
||
|
struct pci_dev *child;
|
||
|
|
||
|
nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
|
||
|
nrdev->sysdata.domain = find_free_domain();
|
||
|
nrdev->sysdata.nvme_remap_dev = dev;
|
||
|
nrdev->dev = dev;
|
||
|
pci_set_drvdata(dev, nrdev);
|
||
|
|
||
|
ret = pcim_enable_device(dev);
|
||
|
if (ret < 0)
|
||
|
return ret;
|
||
|
|
||
|
pci_set_master(dev);
|
||
|
|
||
|
ret = find_remapped_devices(nrdev, &resources);
|
||
|
if (ret)
|
||
|
return ret;
|
||
|
|
||
|
/* Add resources from the original AHCI device */
|
||
|
for (i = 0; i < PCI_NUM_RESOURCES; i++) {
|
||
|
struct resource *res = &dev->resource[i];
|
||
|
|
||
|
if (res->start) {
|
||
|
struct resource *nr_res = &nrdev->ahci_resources[i];
|
||
|
|
||
|
nr_res->start = res->start;
|
||
|
nr_res->end = res->end;
|
||
|
nr_res->flags = res->flags;
|
||
|
pci_add_resource(&resources, nr_res);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* Create virtual interrupts */
|
||
|
nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
|
||
|
nrdev->num_remapped_devices + 1,
|
||
|
0);
|
||
|
if (nrdev->irq_base < 0)
|
||
|
return nrdev->irq_base;
|
||
|
|
||
|
/* Create and populate PCI bus */
|
||
|
nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
|
||
|
&nrdev->sysdata, &resources);
|
||
|
if (!nrdev->bus)
|
||
|
return -ENODEV;
|
||
|
|
||
|
if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
|
||
|
nrdev->bus))
|
||
|
return -ENOMEM;
|
||
|
|
||
|
/* We don't support sharing MSI interrupts between these devices */
|
||
|
nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
|
||
|
|
||
|
pci_scan_child_bus(nrdev->bus);
|
||
|
|
||
|
list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
|
||
|
/*
|
||
|
* Prevent PCI core from trying to move memory BARs around.
|
||
|
* The hidden NVMe devices are at fixed locations.
|
||
|
*/
|
||
|
for (i = 0; i < PCI_NUM_RESOURCES; i++) {
|
||
|
struct resource *res = &child->resource[i];
|
||
|
|
||
|
if (res->flags & IORESOURCE_MEM)
|
||
|
res->flags |= IORESOURCE_PCI_FIXED;
|
||
|
}
|
||
|
|
||
|
/* Share the legacy IRQ between all devices */
|
||
|
child->irq = dev->irq;
|
||
|
}
|
||
|
|
||
|
pci_assign_unassigned_bus_resources(nrdev->bus);
|
||
|
pci_bus_add_devices(nrdev->bus);
|
||
|
|
||
|
return 0;
|
||
|
}
|
||
|
|
||
|
static const struct pci_device_id nvme_remap_ids[] = {
|
||
|
/*
|
||
|
* Match all Intel RAID controllers.
|
||
|
*
|
||
|
* There's overlap here with the set of devices detected by the ahci
|
||
|
* driver, but ahci will only successfully probe when there
|
||
|
* *aren't* any remapped NVMe devices, and this driver will only
|
||
|
* successfully probe when there *are* remapped NVMe devices that
|
||
|
* need handling.
|
||
|
*/
|
||
|
{
|
||
|
PCI_VDEVICE(INTEL, PCI_ANY_ID),
|
||
|
.class = PCI_CLASS_STORAGE_RAID << 8,
|
||
|
.class_mask = 0xffffff00,
|
||
|
},
|
||
|
{0,}
|
||
|
};
|
||
|
MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
|
||
|
|
||
|
static struct pci_driver nvme_remap_drv = {
|
||
|
.name = MODULE_NAME,
|
||
|
.id_table = nvme_remap_ids,
|
||
|
.probe = nvme_remap_probe,
|
||
|
};
|
||
|
module_pci_driver(nvme_remap_drv);
|
||
|
|
||
|
MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
|
||
|
MODULE_LICENSE("GPL v2");
|