一般我们通过dma_alloc_coherent 和 dma_free_coherent 来申请和释放coherent memory。申请coherent memory的代价比较大,只有在要申请的memory大于1个page时才推荐使用,否则推荐用dma_pool 来申请。
/*
 * dma_alloc_coherent - coherent allocation entry point.
 *
 * Forwards every argument unchanged to dma_alloc_attrs() with an empty
 * attribute mask (attrs == 0) and returns that function's result.
 */
static inline void *dma_alloc_coherent(struct device *dev, size_t size,
        dma_addr_t *dma_handle, gfp_t flag)
{
    const unsigned long no_attrs = 0;

    return dma_alloc_attrs(dev, size, dma_handle, flag, no_attrs);
}
dma_alloc_coherent 直接调用dma_alloc_attrs 并把attrs设为0
/*
 * Fallback used only when nothing earlier has defined
 * arch_dma_alloc_attrs: both arguments are ignored and the expression
 * is always true.
 */
#ifndef arch_dma_alloc_attrs
#define arch_dma_alloc_attrs(dev, flag)    (true)
#endif

/*
 * dma_alloc_attrs - allocate DMA memory for @dev with the given @attrs.
 *
 * Order of attempts:
 *   1. dma_alloc_from_coherent(); if it reports success, the CPU
 *      address it produced is returned directly.
 *   2. Otherwise the device's dma_map_ops ->alloc callback.
 *
 * Returns the CPU address of the buffer, or NULL when
 * arch_dma_alloc_attrs() rejects the request or the ops table has no
 * ->alloc callback. A missing ops table is fatal (BUG_ON).
 */
static inline void *dma_alloc_attrs(struct device *dev, size_t size,
                       dma_addr_t *dma_handle, gfp_t flag,
                       unsigned long attrs)
{
    void *vaddr;
    struct dma_map_ops *dma_ops = get_dma_ops(dev);

    BUG_ON(!dma_ops);

    /* First try the device's own coherent allocator. */
    if (dma_alloc_from_coherent(dev, size, dma_handle, &vaddr))
        return vaddr;

    /* Arch veto is evaluated first; then the callback must exist. */
    if (!arch_dma_alloc_attrs(&dev, &flag) || !dma_ops->alloc)
        return NULL;

    vaddr = dma_ops->alloc(dev, size, dma_handle, flag, attrs);
    debug_dma_alloc_coherent(dev, size, *dma_handle, vaddr);

    return vaddr;
}
在dma_alloc_attrs 中首先通过dma_alloc_from_coherent 从device自己的dma memory中申请,如果没有再通过ops->alloc 申请。需要注意的是arch_dma_alloc_attrs 默认(架构没有自行定义时)是写死返回true的,因此这里的检查不会失败;在使能iommu的情况下通过struct dma_map_ops *ops = get_dma_ops(dev); 得到的就是iommu的ops,
最后调用cpu_addr = ops->alloc(dev, size, dma_handle, flag, attrs); 来通过iommu申请memory。
前面的博文讲过iommu_dma_ops的定义如下
/*
 * dma_map_ops callback table named for the IOMMU path. Each DMA
 * operation (alloc/free, mmap, page and scatter-gather map/unmap,
 * cache sync, capability and error checks) is bound to its
 * __iommu_* / iommu_dma_* implementation.
 */
static struct dma_map_ops iommu_dma_ops = {
    .alloc = __iommu_alloc_attrs,
    .free = __iommu_free_attrs,
    .mmap = __iommu_mmap_attrs,
    .get_sgtable = __iommu_get_sgtable,
    .map_page = __iommu_map_page,
    .unmap_page = __iommu_unmap_page,
    .map_sg = __iommu_map_sg_attrs,
    .unmap_sg = __iommu_unmap_sg_attrs,
    .sync_single_for_cpu = __iommu_sync_single_for_cpu,
    .sync_single_for_device = __iommu_sync_single_for_device,
    .sync_sg_for_cpu = __iommu_sync_sg_for_cpu,
    .sync_sg_for_device = __iommu_sync_sg_for_device,
    .dma_supported = iommu_dma_supported,
    .mapping_error = iommu_dma_mapping_error,
};

因此这里调用的是__iommu_alloc_attrs
/*
 * __iommu_alloc_attrs - ->alloc callback of iommu_dma_ops.
 *
 * Allocates a zeroed buffer and maps it for @dev through the IOMMU DMA
 * helpers. On success the CPU address is returned and *@handle holds
 * the DMA address; on any failure NULL is returned.
 *
 * Two paths, selected by gfpflags_allow_blocking(@gfp):
 *   - blocking allowed: iommu_dma_alloc() provides a page array, which
 *     is then remapped into one kernel virtual range;
 *   - otherwise: one physically contiguous allocation (alloc_pages()
 *     for coherent devices, __alloc_from_pool() for non-coherent ones)
 *     mapped with iommu_dma_map_page().
 */
static void *__iommu_alloc_attrs(struct device *dev, size_t size,
                 dma_addr_t *handle, gfp_t gfp,
                 unsigned long attrs)
{
    bool coherent = is_device_dma_coherent(dev);
    int ioprot = dma_direction_to_prot(DMA_BIDIRECTIONAL, coherent);
    /* Keep the caller's size for the IOMMU calls; 'size' is rounded up below. */
    size_t iosize = size;
    void *addr;

    if (WARN(!dev, "cannot create IOMMU mapping for unknown device\n"))
        return NULL;

    /* CPU-side allocation and remap work in whole pages. */
    size = PAGE_ALIGN(size);

    /*
     * Some drivers rely on this, and we probably don't want the
     * possibility of stale kernel data being read by devices anyway.
     */
    gfp |= __GFP_ZERO;

    if (gfpflags_allow_blocking(gfp)) {
        struct page **pages;
        pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);

        pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
                    handle, flush_page);
        if (!pages)
            return NULL;

        addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
                          __builtin_return_address(0));
        /* Remap failed: release the pages; addr is NULL and is returned below. */
        if (!addr)
            iommu_dma_free(dev, pages, iosize, handle);
    } else {
        struct page *page;
        /*
         * In atomic context we can't remap anything, so we'll only
         * get the virtually contiguous buffer we need by way of a
         * physically contiguous allocation.
         */
        if (coherent) {
            page = alloc_pages(gfp, get_order(size));
            addr = page ? page_address(page) : NULL;
        } else {
            addr = __alloc_from_pool(size, &page, gfp);
        }
        if (!addr)
            return NULL;

        *handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot);
        if (iommu_dma_mapping_error(dev, *handle)) {
            /* Undo the allocation with the release routine matching its source. */
            if (coherent)
                __free_pages(page, get_order(size));
            else
                __free_from_pool(addr, size);
            addr = NULL;
        }
    }
    return addr;
}
在__iommu_alloc_attrs 中首先将要申请的size按page对齐
    size = PAGE_ALIGN(size);
这也证明之前说的用dma_alloc_coherent 申请的memory最好是大于1个page的,否则就是巨大的浪费.
这里有一个细节
    gfp |= __GFP_ZERO;

也就是说通过iommu申请的memory会自动清零
这里会通过gfpflags_allow_blocking 判断__GFP_DIRECT_RECLAIM 来走不同的case,不过最终都是通过alloc_pages 来申请的,因此我们专注else的case
page = alloc_pages(gfp, get_order(size));
addr = page ? page_address(page) : NULL;
这里通过alloc_pages申请后,就会通过iommu_dma_map_page来映射,这才是iommu的核心工作
iommu_dma_map_page->iommu_map
/*
 * Abridged excerpt of iommu_map(): only the hand-off to the domain's
 * ->map callback is shown. The declarations of 'ret' and 'pgsize' and
 * the function's return statement are omitted here.
 */
int iommu_map(struct iommu_domain *domain, unsigned long iova,
          phys_addr_t paddr, size_t size, int prot)
{
        ret = domain->ops->map(domain, iova, paddr, pgsize, prot);
}
/*
 * iommu_ops callback table for the ARM SMMU driver. Each operation is
 * bound to its arm_smmu_* implementation, except .map_sg, which uses
 * default_iommu_map_sg.
 */
static struct iommu_ops arm_smmu_ops = {
    .capable        = arm_smmu_capable,
    .domain_alloc        = arm_smmu_domain_alloc,
    .domain_free        = arm_smmu_domain_free,
    .attach_dev        = arm_smmu_attach_dev,
    .map            = arm_smmu_map,
    .unmap            = arm_smmu_unmap,
    .map_sg            = default_iommu_map_sg,
    .iova_to_phys        = arm_smmu_iova_to_phys,
    .add_device        = arm_smmu_add_device,
    .remove_device        = arm_smmu_remove_device,
    .device_group        = arm_smmu_device_group,
    .domain_get_attr    = arm_smmu_domain_get_attr,
    .domain_set_attr    = arm_smmu_domain_set_attr,
    .of_xlate        = arm_smmu_of_xlate,
    .pgsize_bitmap        = -1UL, /* Restricted during device attach */
};

这个arm_smmu_ops 中的arm_smmu_map函数
/*
 * arm_smmu_map - ->map callback of arm_smmu_ops.
 *
 * Hands the request to the domain's io-pgtable ->map operation while
 * holding the domain's pgtbl_lock with interrupts saved and disabled.
 *
 * Returns -ENODEV when the domain has no page-table ops; otherwise
 * whatever the page-table ->map operation returns.
 */
static int arm_smmu_map(struct iommu_domain *domain, unsigned long iova,
            phys_addr_t paddr, size_t size, int prot)
{
    struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
    struct io_pgtable_ops *pgtbl = smmu_domain->pgtbl_ops;
    unsigned long irq_state;
    int rc;

    if (!pgtbl)
        return -ENODEV;

    spin_lock_irqsave(&smmu_domain->pgtbl_lock, irq_state);
    rc = pgtbl->map(pgtbl, iova, paddr, size, prot);
    spin_unlock_irqrestore(&smmu_domain->pgtbl_lock, irq_state);

    return rc;
}
在arm_smmu_map 函数中调用ops->map(ops, iova, paddr, size, prot); 而ops = smmu_domain->pgtbl_ops; 而pgtbl_ops是在arm_smmu_domain_finalise 中赋值
/*
 * Abridged excerpt of arm_smmu_domain_finalise(): only the page-table
 * ops allocation and its -ENOMEM failure path are shown. The
 * declarations of pgtbl_ops, fmt, pgtbl_cfg and smmu_domain, and the
 * success-path code, are omitted here.
 */
static int arm_smmu_domain_finalise(struct iommu_domain *domain)
{
    pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
    if (!pgtbl_ops)
        return -ENOMEM;
}
/*
 * Lookup table from page-table format (array index) to that format's
 * init functions. An entry is populated only when the matching
 * CONFIG_IOMMU_IO_PGTABLE_* option is enabled.
 */
static const struct io_pgtable_init_fns *
io_pgtable_init_table[IO_PGTABLE_NUM_FMTS] = {
#ifdef CONFIG_IOMMU_IO_PGTABLE_LPAE
    [ARM_32_LPAE_S1] = &io_pgtable_arm_32_lpae_s1_init_fns,
    [ARM_32_LPAE_S2] = &io_pgtable_arm_32_lpae_s2_init_fns,
    [ARM_64_LPAE_S1] = &io_pgtable_arm_64_lpae_s1_init_fns,
    [ARM_64_LPAE_S2] = &io_pgtable_arm_64_lpae_s2_init_fns,
#endif
#ifdef CONFIG_IOMMU_IO_PGTABLE_ARMV7S
    [ARM_V7S] = &io_pgtable_arm_v7s_init_fns,
#endif
};

struct io_pgtable_ops *alloc_io_pgtable_ops(enum io_pgtable_fmt fmt,
                        struct io_pgtable_cfg *cfg,
                        void *cookie)
{
    struct io_pgtable *iop;
    const struct io_pgtable_init_fns *fns;

    if (fmt >= IO_PGTABLE_NUM_FMTS)
        return NULL;

    fns = io_pgtable_init_table[fmt];
    if (!fns)
        return NULL;

    iop = fns->alloc(cfg, cookie);
    if (!iop)
        return NULL;

    iop->fmt    = fmt;
    iop->cookie    = cookie;
    iop->cfg    = *cfg;

    return &iop->ops;
}
这边以ARM_64_LPAE_S1 为例
/*
 * Init functions for the ARM_64_LPAE_S1 format:
 * ->alloc is arm_64_lpae_alloc_pgtable_s1, ->free is
 * arm_lpae_free_pgtable.
 */
struct io_pgtable_init_fns io_pgtable_arm_64_lpae_s1_init_fns = {
    .alloc    = arm_64_lpae_alloc_pgtable_s1,
    .free    = arm_lpae_free_pgtable,
};
arm_64_lpae_alloc_pgtable_s1->arm_lpae_alloc_pgtable
    /*
     * Excerpt from inside a function body (the enclosing definition is
     * not shown): installs the LPAE map/unmap/iova_to_phys callbacks
     * as the io_pgtable ops.
     */
    data->iop.ops = (struct io_pgtable_ops) {
        .map        = arm_lpae_map,
        .unmap        = arm_lpae_unmap,
        .iova_to_phys    = arm_lpae_iova_to_phys,
    };
可见最终是调用arm_lpae_map来map的.

更多推荐

DMA Coherent Mapping