powerpc/powernv: Improve kexec reliability
[platform/adaptation/renesas_rcar/renesas_kernel.git] / arch / powerpc / platforms / powernv / pci-ioda.c
index 8e90e89..3937aaa 100644 (file)
 #include <asm/prom.h>
 #include <asm/pci-bridge.h>
 #include <asm/machdep.h>
+#include <asm/msi_bitmap.h>
 #include <asm/ppc-pci.h>
 #include <asm/opal.h>
 #include <asm/iommu.h>
 #include <asm/tce.h>
+#include <asm/xics.h>
 
 #include "powernv.h"
 #include "pci.h"
@@ -87,6 +89,7 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
                        return IODA_INVALID_PE;
        } while(test_and_set_bit(pe, phb->ioda.pe_alloc));
 
+       phb->ioda.pe_array[pe].phb = phb;
        phb->ioda.pe_array[pe].pe_number = pe;
        return pe;
 }
@@ -431,22 +434,102 @@ static void pnv_pci_ioda_setup_PEs(void)
        }
 }
 
-static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *dev)
+static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev)
 {
-       /* We delay DMA setup after we have assigned all PE# */
+       struct pci_dn *pdn = pnv_ioda_get_pdn(pdev);
+       struct pnv_ioda_pe *pe;
+
+       /*
+        * The function can be called while the PE#
+        * hasn't been assigned. Do nothing for the
+        * case.
+        */
+       if (!pdn || pdn->pe_number == IODA_INVALID_PE)
+               return;
+
+       pe = &phb->ioda.pe_array[pdn->pe_number];
+       set_iommu_table_base(&pdev->dev, &pe->tce32_table);
 }
 
-static void pnv_ioda_setup_bus_dma(struct pnv_ioda_pe *pe, struct pci_bus *bus)
+static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
+                                        u64 *startp, u64 *endp)
 {
-       struct pci_dev *dev;
+       u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
+       unsigned long start, end, inc;
+
+       start = __pa(startp);
+       end = __pa(endp);
+
+       /* BML uses this case for p6/p7/galaxy2: Shift addr and put in node */
+       if (tbl->it_busno) {
+               start <<= 12;
+               end <<= 12;
+               inc = 128 << 12;
+               start |= tbl->it_busno;
+               end |= tbl->it_busno;
+       } else if (tbl->it_type & TCE_PCI_SWINV_PAIR) {
+               /* p7ioc-style invalidation, 2 TCEs per write */
+               start |= (1ull << 63);
+               end |= (1ull << 63);
+               inc = 16;
+        } else {
+               /* Default (older HW) */
+                inc = 128;
+       }
 
-       list_for_each_entry(dev, &bus->devices, bus_list) {
-               set_iommu_table_base(&dev->dev, &pe->tce32_table);
-               if (dev->subordinate)
-                       pnv_ioda_setup_bus_dma(pe, dev->subordinate);
+        end |= inc - 1;        /* round up end to be different than start */
+
+        mb(); /* Ensure above stores are visible */
+        while (start <= end) {
+                __raw_writeq(start, invalidate);
+                start += inc;
+        }
+
+       /*
+        * The iommu layer will do another mb() for us on build()
+        * and we don't care on free()
+        */
+}
+
+static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
+                                        struct iommu_table *tbl,
+                                        u64 *startp, u64 *endp)
+{
+       unsigned long start, end, inc;
+       u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
+
+       /* We'll invalidate DMA address in PE scope */
+       start = 0x2ul << 60;
+       start |= (pe->pe_number & 0xFF);
+       end = start;
+
+       /* Figure out the start, end and step */
+       inc = tbl->it_offset + (((u64)startp - tbl->it_base) / sizeof(u64));
+       start |= (inc << 12);
+       inc = tbl->it_offset + (((u64)endp - tbl->it_base) / sizeof(u64));
+       end |= (inc << 12);
+       inc = (0x1ul << 12);
+       mb();
+
+       while (start <= end) {
+               __raw_writeq(start, invalidate);
+               start += inc;
        }
 }
 
+void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
+                                u64 *startp, u64 *endp)
+{
+       struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
+                                             tce32_table);
+       struct pnv_phb *phb = pe->phb;
+
+       if (phb->type == PNV_PHB_IODA1)
+               pnv_pci_ioda1_tce_invalidate(tbl, startp, endp);
+       else
+               pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp);
+}
+
 static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                                      struct pnv_ioda_pe *pe, unsigned int base,
                                      unsigned int segs)
@@ -518,16 +601,11 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                 */
                tbl->it_busno = 0;
                tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8);
-               tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE
-                       | TCE_PCI_SWINV_PAIR;
+               tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE |
+                              TCE_PCI_SWINV_PAIR;
        }
        iommu_init_table(tbl, phb->hose->node);
 
-       if (pe->pdev)
-               set_iommu_table_base(&pe->pdev->dev, tbl);
-       else
-               pnv_ioda_setup_bus_dma(pe, pe->pbus);
-
        return;
  fail:
        /* XXX Failure: Try to fallback to 64-bit only ? */
@@ -537,6 +615,76 @@ static void pnv_pci_ioda_setup_dma_pe(struct pnv_phb *phb,
                __free_pages(tce_mem, get_order(TCE32_TABLE_SIZE * segs));
 }
 
+static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
+                                      struct pnv_ioda_pe *pe)
+{
+       struct page *tce_mem = NULL;
+       void *addr;
+       const __be64 *swinvp;
+       struct iommu_table *tbl;
+       unsigned int tce_table_size, end;
+       int64_t rc;
+
+       /* We shouldn't already have a 32-bit DMA associated */
+       if (WARN_ON(pe->tce32_seg >= 0))
+               return;
+
+       /* The PE will reserve all possible 32-bits space */
+       pe->tce32_seg = 0;
+       end = (1 << ilog2(phb->ioda.m32_pci_base));
+       tce_table_size = (end / 0x1000) * 8;
+       pe_info(pe, "Setting up 32-bit TCE table at 0..%08x\n",
+               end);
+
+       /* Allocate TCE table */
+       tce_mem = alloc_pages_node(phb->hose->node, GFP_KERNEL,
+                                  get_order(tce_table_size));
+       if (!tce_mem) {
+               pe_err(pe, "Failed to allocate a 32-bit TCE memory\n");
+               goto fail;
+       }
+       addr = page_address(tce_mem);
+       memset(addr, 0, tce_table_size);
+
+       /*
+        * Map TCE table through TVT. The TVE index is the PE number
+        * shifted by 1 bit for 32-bits DMA space.
+        */
+       rc = opal_pci_map_pe_dma_window(phb->opal_id, pe->pe_number,
+                                       pe->pe_number << 1, 1, __pa(addr),
+                                       tce_table_size, 0x1000);
+       if (rc) {
+               pe_err(pe, "Failed to configure 32-bit TCE table,"
+                      " err %ld\n", rc);
+               goto fail;
+       }
+
+       /* Setup linux iommu table */
+       tbl = &pe->tce32_table;
+       pnv_pci_setup_iommu_table(tbl, addr, tce_table_size, 0);
+
+       /* OPAL variant of PHB3 invalidated TCEs */
+       swinvp = of_get_property(phb->hose->dn, "ibm,opal-tce-kill", NULL);
+       if (swinvp) {
+               /* We need a couple more fields -- an address and a data
+                * to or.  Since the bus is only printed out on table free
+                * errors, and on the first pass the data will be a relative
+                * bus number, print that out instead.
+                */
+               tbl->it_busno = 0;
+               tbl->it_index = (unsigned long)ioremap(be64_to_cpup(swinvp), 8);
+               tbl->it_type = TCE_PCI_SWINV_CREATE | TCE_PCI_SWINV_FREE;
+       }
+       iommu_init_table(tbl, phb->hose->node);
+
+       return;
+fail:
+       if (pe->tce32_seg >= 0)
+               pe->tce32_seg = -1;
+       if (tce_mem)
+               __free_pages(tce_mem, get_order(tce_table_size));
+}
+
 static void pnv_ioda_setup_dma(struct pnv_phb *phb)
 {
        struct pci_controller *hose = phb->hose;
@@ -579,20 +727,49 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
                        if (segs > remaining)
                                segs = remaining;
                }
-               pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
-                       pe->dma_weight, segs);
-               pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+
+               /*
+                * For IODA2 compliant PHB3, we needn't care about the weight.
+                * The all available 32-bits DMA space will be assigned to
+                * the specific PE.
+                */
+               if (phb->type == PNV_PHB_IODA1) {
+                       pe_info(pe, "DMA weight %d, assigned %d DMA32 segments\n",
+                               pe->dma_weight, segs);
+                       pnv_pci_ioda_setup_dma_pe(phb, pe, base, segs);
+               } else {
+                       pe_info(pe, "Assign DMA32 space\n");
+                       segs = 0;
+                       pnv_pci_ioda2_setup_dma_pe(phb, pe);
+               }
+
                remaining -= segs;
                base += segs;
        }
 }
 
 #ifdef CONFIG_PCI_MSI
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
+       unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+       struct irq_chip *chip = irq_data_get_irq_chip(d);
+       struct pnv_phb *phb = container_of(chip, struct pnv_phb,
+                                          ioda.irq_chip);
+       int64_t rc;
+
+       rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+       WARN_ON_ONCE(rc);
+
+       icp_native_eoi(d);
+}
+
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
-                                 unsigned int hwirq, unsigned int is_64,
-                                 struct msi_msg *msg)
+                                 unsigned int hwirq, unsigned int virq,
+                                 unsigned int is_64, struct msi_msg *msg)
 {
        struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
+       struct irq_data *idata;
+       struct irq_chip *ichip;
        unsigned int xive_num = hwirq - phb->msi_base;
        uint64_t addr64;
        uint32_t addr32, data;
@@ -637,6 +814,23 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
        }
        msg->data = data;
 
+       /*
+        * Change the IRQ chip for the MSI interrupts on PHB3.
+        * The corresponding IRQ chip should be populated for
+        * the first time.
+        */
+       if (phb->type == PNV_PHB_IODA2) {
+               if (!phb->ioda.irq_chip_init) {
+                       idata = irq_get_irq_data(virq);
+                       ichip = irq_data_get_irq_chip(idata);
+                       phb->ioda.irq_chip_init = 1;
+                       phb->ioda.irq_chip = *ichip;
+                       phb->ioda.irq_chip.irq_eoi = pnv_ioda2_msi_eoi;
+               }
+
+               irq_set_chip(virq, &phb->ioda.irq_chip);
+       }
+
        pr_devel("%s: %s-bit MSI on hwirq %x (xive #%d),"
                 " address=%x_%08x data=%x PE# %d\n",
                 pci_name(dev), is_64 ? "64" : "32", hwirq, xive_num,
@@ -647,7 +841,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
 
 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
 {
-       unsigned int bmap_size;
+       unsigned int count;
        const __be32 *prop = of_get_property(phb->hose->dn,
                                             "ibm,opal-msi-ranges", NULL);
        if (!prop) {
@@ -658,18 +852,17 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb)
                return;
 
        phb->msi_base = be32_to_cpup(prop);
-       phb->msi_count = be32_to_cpup(prop + 1);
-       bmap_size = BITS_TO_LONGS(phb->msi_count) * sizeof(unsigned long);
-       phb->msi_map = zalloc_maybe_bootmem(bmap_size, GFP_KERNEL);
-       if (!phb->msi_map) {
+       count = be32_to_cpup(prop + 1);
+       if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
                pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
                       phb->hose->global_number);
                return;
        }
+
        phb->msi_setup = pnv_pci_ioda_msi_setup;
        phb->msi32_support = 1;
        pr_info("  Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
-               phb->msi_count, phb->msi_base);
+               count, phb->msi_base);
 }
 #else
 static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
@@ -722,11 +915,14 @@ static void pnv_ioda_setup_pe_seg(struct pci_controller *hose,
                                index++;
                        }
                } else if (res->flags & IORESOURCE_MEM) {
+                       /* WARNING: Assumes M32 is mem region 0 in PHB. We need to
+                        * harden that algorithm when we start supporting M64
+                        */
                        region.start = res->start -
-                                      hose->pci_mem_offset -
+                                      hose->mem_offset[0] -
                                       phb->ioda.m32_pci_base;
                        region.end   = res->end -
-                                      hose->pci_mem_offset -
+                                      hose->mem_offset[0] -
                                       phb->ioda.m32_pci_base;
                        index = region.start / phb->ioda.m32_segsize;
 
@@ -852,18 +1048,25 @@ static u32 pnv_ioda_bdfn_to_pe(struct pnv_phb *phb, struct pci_bus *bus,
        return phb->ioda.pe_rmap[(bus->number << 8) | devfn];
 }
 
-void __init pnv_pci_init_ioda1_phb(struct device_node *np)
+static void pnv_pci_ioda_shutdown(struct pnv_phb *phb)
+{
+       opal_pci_reset(phb->opal_id, OPAL_PCI_IODA_TABLE_RESET,
+                      OPAL_ASSERT_RESET);
+}
+
+void __init pnv_pci_init_ioda_phb(struct device_node *np, int ioda_type)
 {
        struct pci_controller *hose;
        static int primary = 1;
        struct pnv_phb *phb;
        unsigned long size, m32map_off, iomap_off, pemap_off;
        const u64 *prop64;
+       const u32 *prop32;
        u64 phb_id;
        void *aux;
        long rc;
 
-       pr_info(" Initializing IODA OPAL PHB %s\n", np->full_name);
+       pr_info(" Initializing IODA%d OPAL PHB %s\n", ioda_type, np->full_name);
 
        prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
        if (!prop64) {
@@ -890,47 +1093,46 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
        hose->last_busno = 0xff;
        hose->private_data = phb;
        phb->opal_id = phb_id;
-       phb->type = PNV_PHB_IODA1;
+       phb->type = ioda_type;
 
        /* Detect specific models for error handling */
        if (of_device_is_compatible(np, "ibm,p7ioc-pciex"))
                phb->model = PNV_PHB_MODEL_P7IOC;
+       else if (of_device_is_compatible(np, "ibm,power8-pciex"))
+               phb->model = PNV_PHB_MODEL_PHB3;
        else
                phb->model = PNV_PHB_MODEL_UNKNOWN;
 
-       /* We parse "ranges" now since we need to deduce the register base
-        * from the IO base
-        */
+       /* Parse 32-bit and IO ranges (if any) */
        pci_process_bridge_OF_ranges(phb->hose, np, primary);
        primary = 0;
 
-       /* Magic formula from Milton */
+       /* Get registers */
        phb->regs = of_iomap(np, 0);
        if (phb->regs == NULL)
                pr_err("  Failed to map registers !\n");
 
-
-       /* XXX This is hack-a-thon. This needs to be changed so that:
-        *  - we obtain stuff like PE# etc... from device-tree
-        *  - we properly re-allocate M32 ourselves
-        *    (the OFW one isn't very good)
-        */
-
        /* Initialize more IODA stuff */
-       phb->ioda.total_pe = 128;
+       prop32 = of_get_property(np, "ibm,opal-num-pes", NULL);
+       if (!prop32)
+               phb->ioda.total_pe = 1;
+       else
+               phb->ioda.total_pe = *prop32;
 
        phb->ioda.m32_size = resource_size(&hose->mem_resources[0]);
-       /* OFW Has already off top 64k of M32 space (MSI space) */
+       /* FW Has already off top 64k of M32 space (MSI space) */
        phb->ioda.m32_size += 0x10000;
 
        phb->ioda.m32_segsize = phb->ioda.m32_size / phb->ioda.total_pe;
-       phb->ioda.m32_pci_base = hose->mem_resources[0].start -
-               hose->pci_mem_offset;
+       phb->ioda.m32_pci_base = hose->mem_resources[0].start - hose->mem_offset[0];
        phb->ioda.io_size = hose->pci_io_size;
        phb->ioda.io_segsize = phb->ioda.io_size / phb->ioda.total_pe;
        phb->ioda.io_pci_base = 0; /* XXX calculate this ? */
 
-       /* Allocate aux data & arrays */
+       /* Allocate aux data & arrays
+        *
+        * XXX TODO: Don't allocate io segmap on PHB3
+        */
        size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
        m32map_off = size;
        size += phb->ioda.total_pe * sizeof(phb->ioda.m32_segmap[0]);
@@ -960,7 +1162,7 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
        hose->mem_resources[2].start = 0;
        hose->mem_resources[2].end = 0;
 
-#if 0
+#if 0 /* We should really do that ... */
        rc = opal_pci_set_phb_mem_window(opal->phb_id,
                                         window_type,
                                         window_num,
@@ -974,16 +1176,6 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
                phb->ioda.m32_size, phb->ioda.m32_segsize,
                phb->ioda.io_size, phb->ioda.io_segsize);
 
-       if (phb->regs)  {
-               pr_devel(" BUID     = 0x%016llx\n", in_be64(phb->regs + 0x100));
-               pr_devel(" PHB2_CR  = 0x%016llx\n", in_be64(phb->regs + 0x160));
-               pr_devel(" IO_BAR   = 0x%016llx\n", in_be64(phb->regs + 0x170));
-               pr_devel(" IO_BAMR  = 0x%016llx\n", in_be64(phb->regs + 0x178));
-               pr_devel(" IO_SAR   = 0x%016llx\n", in_be64(phb->regs + 0x180));
-               pr_devel(" M32_BAR  = 0x%016llx\n", in_be64(phb->regs + 0x190));
-               pr_devel(" M32_BAMR = 0x%016llx\n", in_be64(phb->regs + 0x198));
-               pr_devel(" M32_SAR  = 0x%016llx\n", in_be64(phb->regs + 0x1a0));
-       }
        phb->hose->ops = &pnv_pci_ops;
 
        /* Setup RID -> PE mapping function */
@@ -992,6 +1184,9 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
        /* Setup TCEs */
        phb->dma_dev_setup = pnv_pci_ioda_dma_dev_setup;
 
+       /* Setup shutdown function for kexec */
+       phb->shutdown = pnv_pci_ioda_shutdown;
+
        /* Setup MSI support */
        pnv_pci_init_ioda_msis(phb);
 
@@ -1011,7 +1206,18 @@ void __init pnv_pci_init_ioda1_phb(struct device_node *np)
        rc = opal_pci_reset(phb_id, OPAL_PCI_IODA_TABLE_RESET, OPAL_ASSERT_RESET);
        if (rc)
                pr_warning("  OPAL Error %ld performing IODA table reset !\n", rc);
-       opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE);
+
+       /*
+        * On IODA1 map everything to PE#0, on IODA2 we assume the IODA reset
+        * has cleared the RTT which has the same effect
+        */
+       if (ioda_type == PNV_PHB_IODA1)
+               opal_pci_set_pe(phb_id, 0, 0, 7, 1, 1 , OPAL_MAP_PE);
+}
+
+void pnv_pci_init_ioda2_phb(struct device_node *np)
+{
+       pnv_pci_init_ioda_phb(np, PNV_PHB_IODA2);
 }
 
 void __init pnv_pci_init_ioda_hub(struct device_node *np)
@@ -1034,6 +1240,6 @@ void __init pnv_pci_init_ioda_hub(struct device_node *np)
        for_each_child_of_node(np, phbn) {
                /* Look for IODA1 PHBs */
                if (of_device_is_compatible(phbn, "ibm,ioda-phb"))
-                       pnv_pci_init_ioda1_phb(phbn);
+                       pnv_pci_init_ioda_phb(phbn, PNV_PHB_IODA1);
        }
 }