linux-zen-server/drivers/gpu/drm/i915/gt/gen6_engine_cs.c

// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen6_engine_cs.h"
#include "intel_engine.h"
#include "intel_engine_regs.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm_irq.h"
#include "intel_ring.h"

#define HWS_SCRATCH_ADDR	(I915_GEM_HWS_SCRATCH * sizeof(u32))

/*
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 * [DevSNB-C+{W/A}] Before any depth stall flush (including those
 * produced by non-pipelined state commands), software needs to first
 * send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 * 0.
 *
 * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 * =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 * BEFORE the pipe-control with a post-sync op and no write-cache
 * flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
static int
gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)
{
	/* Render-flush scratch slot, addressed through the global GTT. */
	const u32 scratch =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH) |
		PIPE_CONTROL_GLOBAL_GTT;
	u32 *ring;

	/*
	 * Packet one: CS stall paired with stall-at-scoreboard and no
	 * post-sync operation. The trailing MI_NOOP fills the sixth of the
	 * six dwords reserved.
	 */
	ring = intel_ring_begin(rq, 6);
	if (IS_ERR(ring))
		return PTR_ERR(ring);

	ring[0] = GFX_OP_PIPE_CONTROL(5);
	ring[1] = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	ring[2] = scratch;
	ring[3] = 0; /* low dword */
	ring[4] = 0; /* high dword */
	ring[5] = MI_NOOP;
	intel_ring_advance(rq, ring + 6);

	/*
	 * Packet two: the PIPE_CONTROL with a non-zero post-sync operation
	 * (a qword write of zero to the scratch slot) and nothing else set.
	 */
	ring = intel_ring_begin(rq, 6);
	if (IS_ERR(ring))
		return PTR_ERR(ring);

	ring[0] = GFX_OP_PIPE_CONTROL(5);
	ring[1] = PIPE_CONTROL_QW_WRITE;
	ring[2] = scratch;
	ring[3] = 0;
	ring[4] = 0;
	ring[5] = MI_NOOP;
	intel_ring_advance(rq, ring + 6);

	return 0;
}

/*
 * Emit a gen6 render-engine flush and/or invalidate into @rq's ring.
 *
 * @mode is a bitmask of EMIT_FLUSH and EMIT_INVALIDATE. The SNB
 * post-sync-nonzero workaround sequence is always emitted first; the
 * PIPE_CONTROL that follows carries whatever bits @mode selects.
 *
 * Returns 0 on success, or the error reported while reserving ring space.
 */
int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	const u32 scratch =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 pc_flags = 0;
	u32 *ring;
	int err;

	/* The SNB PIPE_CONTROL workarounds must precede the real flush. */
	err = gen6_emit_post_sync_nonzero_flush(rq);
	if (err)
		return err;

	/*
	 * No attempt is made to tailor the flush to the write domains:
	 * experiments showed little performance benefit, and once requests
	 * are rearranged the order of flushes is unknown anyway.
	 *
	 * CS_STALL is included so that any following seqno write only
	 * happens once the render cache has really been flushed.
	 */
	if (mode & EMIT_FLUSH)
		pc_flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			    PIPE_CONTROL_CS_STALL;

	/* A TLB invalidate needs a post-sync write, hence QW_WRITE. */
	if (mode & EMIT_INVALIDATE)
		pc_flags |= PIPE_CONTROL_TLB_INVALIDATE |
			    PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE |
			    PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
			    PIPE_CONTROL_VF_CACHE_INVALIDATE |
			    PIPE_CONTROL_CONST_CACHE_INVALIDATE |
			    PIPE_CONTROL_STATE_CACHE_INVALIDATE |
			    PIPE_CONTROL_QW_WRITE |
			    PIPE_CONTROL_CS_STALL;

	ring = intel_ring_begin(rq, 4);
	if (IS_ERR(ring))
		return PTR_ERR(ring);

	ring[0] = GFX_OP_PIPE_CONTROL(4);
	ring[1] = pc_flags;
	ring[2] = scratch | PIPE_CONTROL_GLOBAL_GTT;
	ring[3] = 0;
	intel_ring_advance(rq, ring + 4);

	return 0;
}

/*
 * Write the gen6 render-engine breadcrumb for @rq at @cs.
 *
 * Emits the two workaround PIPE_CONTROLs, then a flushing PIPE_CONTROL
 * whose post-sync qword write stores rq->fence.seqno at the request's
 * active seqno address, followed by MI_USER_INTERRUPT. rq->tail is set
 * to the ring offset just past the emitted commands.
 *
 * Returns the advanced command pointer.
 */
u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	const u32 wa_scratch =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT) |
		PIPE_CONTROL_GLOBAL_GTT;
	const u32 flush_bits = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			       PIPE_CONTROL_DC_FLUSH_ENABLE |
			       PIPE_CONTROL_QW_WRITE |
			       PIPE_CONTROL_CS_STALL;

	/* Same sequence as gen6_emit_post_sync_nonzero_flush: stall first... */
	cs[0] = GFX_OP_PIPE_CONTROL(4);
	cs[1] = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	cs[2] = 0;
	cs[3] = 0;
	cs += 4;

	/* ...then a PIPE_CONTROL whose only content is a post-sync write. */
	cs[0] = GFX_OP_PIPE_CONTROL(4);
	cs[1] = PIPE_CONTROL_QW_WRITE;
	cs[2] = wa_scratch;
	cs[3] = 0;
	cs += 4;

	/* Now the real flush, which also writes the breadcrumb seqno. */
	cs[0] = GFX_OP_PIPE_CONTROL(4);
	cs[1] = flush_bits;
	cs[2] = i915_request_active_seqno(rq) | PIPE_CONTROL_GLOBAL_GTT;
	cs[3] = rq->fence.seqno;
	cs += 4;

	cs[0] = MI_USER_INTERRUPT;
	cs[1] = MI_NOOP;
	cs += 2;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

/*
 * Emit a single MI_FLUSH_DW, with @flags OR-ed into the command dword,
 * that stores a zero dword at the HWS scratch index.
 *
 * Returns 0 on success, or the error from reserving ring space.
 */
static int mi_flush_dw(struct i915_request *rq, u32 flags)
{
	u32 *ring = intel_ring_begin(rq, 4);

	if (IS_ERR(ring))
		return PTR_ERR(ring);

	/*
	 * The store-dword post-sync op is always requested: it acts as a
	 * command barrier, so later commands (breadcrumb interrupts, for
	 * example) are strictly ordered after the write cache has been
	 * flushed to memory and is therefore coherent from the CPU.
	 *
	 * It also satisfies Bspec vol 1c.3 (blitter engine command
	 * streamer) for any TLB-invalidate bit carried in @flags:
	 * "If ENABLED, all TLBs will be invalidated once the flush
	 * operation is complete. This bit is only valid when the
	 * Post-Sync Operation field is a value of 1h or 3h."
	 */
	ring[0] = MI_FLUSH_DW |
		  MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW |
		  flags;
	ring[1] = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT;
	ring[2] = 0;
	ring[3] = MI_NOOP;

	intel_ring_advance(rq, ring + 4);

	return 0;
}

/*
 * Emit an MI_FLUSH_DW for @rq, adding @invflags to the command only when
 * @mode requests EMIT_INVALIDATE.
 */
static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)
{
	u32 extra = 0;

	if (mode & EMIT_INVALIDATE)
		extra = invflags;

	return mi_flush_dw(rq, extra);
}

/*
 * Flush entry point that passes MI_INVALIDATE_TLB as the invalidate flag
 * set; gen6_flush_dw() applies it only when @mode has EMIT_INVALIDATE.
 */
int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)
{
	const u32 invalidate_bits = MI_INVALIDATE_TLB;

	return gen6_flush_dw(rq, mode, invalidate_bits);
}

/*
 * Flush entry point that passes MI_INVALIDATE_TLB | MI_INVALIDATE_BSD as
 * the invalidate flag set; gen6_flush_dw() applies it only when @mode has
 * EMIT_INVALIDATE.
 */
int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	const u32 invalidate_bits = MI_INVALIDATE_TLB | MI_INVALIDATE_BSD;

	return gen6_flush_dw(rq, mode, invalidate_bits);
}

/*
 * Emit a batch-buffer start for the batch at @offset.
 *
 * The batch is marked MI_BATCH_NON_SECURE_I965 unless @dispatch_flags
 * contains I915_DISPATCH_SECURE. @len is not used by this function.
 *
 * Returns 0 on success, or the error from reserving ring space.
 */
int gen6_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 security = MI_BATCH_NON_SECURE_I965;
	u32 *ring, *end;

	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	ring = intel_ring_begin(rq, 2);
	if (IS_ERR(ring))
		return PTR_ERR(ring);

	end = __gen6_emit_bb_start(ring, offset, security);
	intel_ring_advance(rq, end);

	return 0;
}

/*
 * Emit a batch-buffer start for the batch at @offset.
 *
 * The batch is flagged MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW; both
 * bits are dropped when @dispatch_flags contains I915_DISPATCH_SECURE.
 * @len is not used by this function.
 *
 * Returns 0 on success, or the error from reserving ring space.
 */
int
hsw_emit_bb_start(struct i915_request *rq,
		  u64 offset, u32 len,
		  unsigned int dispatch_flags)
{
	u32 security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW;
	u32 *ring, *end;

	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	ring = intel_ring_begin(rq, 2);
	if (IS_ERR(ring))
		return PTR_ERR(ring);

	end = __gen6_emit_bb_start(ring, offset, security);
	intel_ring_advance(rq, end);

	return 0;
}

/*
 * Emit a PIPE_CONTROL carrying only CS_STALL and STALL_AT_SCOREBOARD.
 *
 * Returns 0 on success, or the error from reserving ring space.
 */
static int gen7_stall_cs(struct i915_request *rq)
{
	u32 *ring = intel_ring_begin(rq, 4);

	if (IS_ERR(ring))
		return PTR_ERR(ring);

	ring[0] = GFX_OP_PIPE_CONTROL(4);
	ring[1] = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD;
	ring[2] = 0;
	ring[3] = 0;
	intel_ring_advance(rq, ring + 4);

	return 0;
}

/*
 * Emit a gen7 render-engine flush and/or invalidate into @rq's ring.
 *
 * @rq:   request whose ring receives the commands
 * @mode: bitmask of EMIT_FLUSH and/or EMIT_INVALIDATE
 *
 * One PIPE_CONTROL is always emitted, carrying CS_STALL and a qword
 * post-sync write to the render-flush scratch slot, plus the flush and
 * invalidate bits selected by @mode. When EMIT_INVALIDATE is requested, a
 * separate CS-stall PIPE_CONTROL is emitted first.
 *
 * Returns 0 on success, or the error reported while reserving ring space
 * for either PIPE_CONTROL.
 */
int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 scratch_addr =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);
	u32 *cs, flags = 0;
	int ret;

	/*
	 * Ensure that any following seqno writes only happen when the render
	 * cache is indeed flushed.
	 *
	 * Workaround: 4th PIPE_CONTROL command (except the ones with only
	 * read-cache invalidate bits set) must have the CS_STALL bit set. We
	 * don't try to be clever and just set it unconditionally.
	 */
	flags |= PIPE_CONTROL_CS_STALL;

	/*
	 * CS_STALL suggests at least a post-sync write.
	 */
	flags |= PIPE_CONTROL_QW_WRITE;
	flags |= PIPE_CONTROL_GLOBAL_GTT_IVB;

	/*
	 * Just flush everything.  Experiments have shown that reducing the
	 * number of bits based on the write domains has little performance
	 * impact.
	 */
	if (mode & EMIT_FLUSH) {
		flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
		flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
		flags |= PIPE_CONTROL_FLUSH_ENABLE;
	}
	if (mode & EMIT_INVALIDATE) {
		flags |= PIPE_CONTROL_TLB_INVALIDATE;
		flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR;

		/*
		 * Workaround: we must issue a pipe_control with CS-stall bit
		 * set before a pipe_control command that has the state cache
		 * invalidate bit set.
		 *
		 * If the stall cannot be emitted, bail out rather than emit
		 * the invalidate without its required workaround.
		 */
		ret = gen7_stall_cs(rq);
		if (ret)
			return ret;
	}

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = GFX_OP_PIPE_CONTROL(4);
	*cs++ = flags;
	*cs++ = scratch_addr;
	*cs++ = 0;
	intel_ring_advance(rq, cs);

	return 0;
}

/*
 * Write the gen7 render-engine breadcrumb for @rq at @cs.
 *
 * A single flushing PIPE_CONTROL stores rq->fence.seqno at the request's
 * active seqno address via its post-sync qword write, then
 * MI_USER_INTERRUPT is emitted. rq->tail is set to the ring offset just
 * past the emitted commands.
 *
 * Returns the advanced command pointer.
 */
u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)
{
	const u32 flush_bits = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
			       PIPE_CONTROL_DEPTH_CACHE_FLUSH |
			       PIPE_CONTROL_DC_FLUSH_ENABLE |
			       PIPE_CONTROL_FLUSH_ENABLE |
			       PIPE_CONTROL_QW_WRITE |
			       PIPE_CONTROL_GLOBAL_GTT_IVB |
			       PIPE_CONTROL_CS_STALL;

	/* Flush and write the seqno in one PIPE_CONTROL. */
	cs[0] = GFX_OP_PIPE_CONTROL(4);
	cs[1] = flush_bits;
	cs[2] = i915_request_active_seqno(rq);
	cs[3] = rq->fence.seqno;
	cs += 4;

	cs[0] = MI_USER_INTERRUPT;
	cs[1] = MI_NOOP;
	cs += 2;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

/*
 * Write the gen6 breadcrumb for @rq at @cs using MI_FLUSH_DW.
 *
 * The flush's store-dword post-sync op writes rq->fence.seqno at the
 * I915_GEM_HWS_SEQNO_ADDR index, then MI_USER_INTERRUPT is emitted.
 * rq->tail is set to the ring offset just past the emitted commands.
 *
 * Returns the advanced command pointer.
 */
u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	const u32 seqno = rq->fence.seqno;

	/* The request's timeline must use the engine status page's seqno slot. */
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	cs[0] = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	cs[1] = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	cs[2] = seqno;
	cs[3] = MI_USER_INTERRUPT;
	cs += 4;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

/* Number of repeated seqno stores emitted by the loop below. */
#define GEN7_XCS_WA 32
/*
 * Write the gen7 breadcrumb for @rq at @cs using MI_FLUSH_DW.
 *
 * Sequence: a TLB-invalidating MI_FLUSH_DW that stores rq->fence.seqno at
 * the I915_GEM_HWS_SEQNO_ADDR index, GEN7_XCS_WA repeated
 * MI_STORE_DWORD_INDEX writes of the same value to the same index, a bare
 * MI_FLUSH_DW, and finally MI_USER_INTERRUPT. rq->tail is set to the ring
 * offset just past the emitted commands.
 *
 * Returns the advanced command pointer.
 */
u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)
{
	const u32 seqno = rq->fence.seqno;
	unsigned int remaining = GEN7_XCS_WA;

	/* The request's timeline must use the engine status page's seqno slot. */
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	cs[0] = MI_FLUSH_DW | MI_INVALIDATE_TLB |
		MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
	cs[1] = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
	cs[2] = seqno;
	cs += 3;

	/*
	 * Workaround (per the macro name): repeat the seqno store
	 * GEN7_XCS_WA times. NOTE(review): the rationale for the repeat
	 * count is not documented in this file.
	 */
	while (remaining--) {
		cs[0] = MI_STORE_DWORD_INDEX;
		cs[1] = I915_GEM_HWS_SEQNO_ADDR;
		cs[2] = seqno;
		cs += 3;
	}

	cs[0] = MI_FLUSH_DW;
	cs[1] = 0;
	cs[2] = 0;
	cs += 3;

	cs[0] = MI_USER_INTERRUPT;
	cs[1] = MI_NOOP;
	cs += 2;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}
#undef GEN7_XCS_WA

/*
 * Enable the engine's interrupt.
 *
 * Writes RING_IMR with the complement of (irq_enable_mask | irq_keep_mask),
 * reads the register back, and then calls gen5_gt_enable_irq() for
 * irq_enable_mask on the engine's GT.
 */
void gen6_irq_enable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR,
		     ~(engine->irq_enable_mask | engine->irq_keep_mask));

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

/*
 * Disable the engine's interrupt.
 *
 * Writes RING_IMR with the complement of irq_keep_mask alone (the value
 * gen6_irq_enable() writes also includes irq_enable_mask), then calls
 * gen5_gt_disable_irq() for irq_enable_mask on the engine's GT.
 */
void gen6_irq_disable(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}

/*
 * Enable the engine's interrupt through the GT PM interrupt helpers.
 *
 * Writes RING_IMR with the complement of irq_enable_mask, reads the
 * register back, and then calls gen6_gt_pm_unmask_irq() for
 * irq_enable_mask on the engine's GT.
 */
void hsw_irq_enable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);

	/* Flush/delay to ensure the RING_IMR is active before the GT IMR */
	ENGINE_POSTING_READ(engine, RING_IMR);

	gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);
}

/*
 * Disable the engine's interrupt through the GT PM interrupt helpers.
 *
 * Writes all-ones to RING_IMR, then calls gen6_gt_pm_mask_irq() for
 * irq_enable_mask on the engine's GT.
 */
void hsw_irq_disable_vecs(struct intel_engine_cs *engine)
{
	ENGINE_WRITE(engine, RING_IMR, ~0);
	gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);
}
Initial commit 2023-08-30 17:53:23 +02:00			`// SPDX-License-Identifier: MIT`
			`/*`
			`* Copyright © 2020 Intel Corporation`
			`*/`

			`#include "gen6_engine_cs.h"`
			`#include "intel_engine.h"`
			`#include "intel_engine_regs.h"`
			`#include "intel_gpu_commands.h"`
			`#include "intel_gt.h"`
			`#include "intel_gt_irq.h"`
			`#include "intel_gt_pm_irq.h"`
			`#include "intel_ring.h"`

			`#define HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32))`

			`/*`
			`* Emits a PIPE_CONTROL with a non-zero post-sync operation, for`
			`* implementing two workarounds on gen6. From section 1.4.7.1`
			`* "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:`
			`*`
			`* [DevSNB-C+{W/A}] Before any depth stall flush (including those`
			`* produced by non-pipelined state commands), software needs to first`
			`* send a PIPE_CONTROL with no bits set except Post-Sync Operation !=`
			`* 0.`
			`*`
			`* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable`
			`* =1, a PIPE_CONTROL with any non-zero post-sync-op is required.`
			`*`
			`* And the workaround for these two requires this workaround first:`
			`*`
			`* [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent`
			`* BEFORE the pipe-control with a post-sync op and no write-cache`
			`* flushes.`
			`*`
			`* And this last workaround is tricky because of the requirements on`
			`* that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM`
			`* volume 2 part 1:`
			`*`
			`* "1 of the following must also be set:`
			`* - Render Target Cache Flush Enable ([12] of DW1)`
			`* - Depth Cache Flush Enable ([0] of DW1)`
			`* - Stall at Pixel Scoreboard ([1] of DW1)`
			`* - Depth Stall ([13] of DW1)`
			`* - Post-Sync Operation ([13] of DW1)`
			`* - Notify Enable ([8] of DW1)"`
			`*`
			`* The cache flushes require the workaround flush that triggered this`
			`* one, so we can't use it. Depth stall would trigger the same.`
			`* Post-sync nonzero is what triggered this second workaround, so we`
			`* can't use that one either. Notify enable is IRQs, which aren't`
			`* really our business. That leaves only stall at scoreboard.`
			`*/`
			`static int`
			`gen6_emit_post_sync_nonzero_flush(struct i915_request *rq)`
			`{`
			`u32 scratch_addr =`
			`intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);`
			`u32 *cs;`

			`cs = intel_ring_begin(rq, 6);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = GFX_OP_PIPE_CONTROL(5);`
			`*cs++ = PIPE_CONTROL_CS_STALL \| PIPE_CONTROL_STALL_AT_SCOREBOARD;`
			`*cs++ = scratch_addr \| PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = 0; /* low dword */`
			`*cs++ = 0; /* high dword */`
			`*cs++ = MI_NOOP;`
			`intel_ring_advance(rq, cs);`

			`cs = intel_ring_begin(rq, 6);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = GFX_OP_PIPE_CONTROL(5);`
			`*cs++ = PIPE_CONTROL_QW_WRITE;`
			`*cs++ = scratch_addr \| PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = 0;`
			`*cs++ = 0;`
			`*cs++ = MI_NOOP;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode)`
			`{`
			`u32 scratch_addr =`
			`intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);`
			`u32 *cs, flags = 0;`
			`int ret;`

			`/* Force SNB workarounds for PIPE_CONTROL flushes */`
			`ret = gen6_emit_post_sync_nonzero_flush(rq);`
			`if (ret)`
			`return ret;`

			`/*`
			`* Just flush everything. Experiments have shown that reducing the`
			`* number of bits based on the write domains has little performance`
			`* impact. And when rearranging requests, the order of flushes is`
			`* unknown.`
			`*/`
			`if (mode & EMIT_FLUSH) {`
			`flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;`
			`flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;`
			`/*`
			`* Ensure that any following seqno writes only happen`
			`* when the render cache is indeed flushed.`
			`*/`
			`flags \|= PIPE_CONTROL_CS_STALL;`
			`}`
			`if (mode & EMIT_INVALIDATE) {`
			`flags \|= PIPE_CONTROL_TLB_INVALIDATE;`
			`flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;`
			`/*`
			`* TLB invalidate requires a post-sync write.`
			`*/`
			`flags \|= PIPE_CONTROL_QW_WRITE \| PIPE_CONTROL_CS_STALL;`
			`}`

			`cs = intel_ring_begin(rq, 4);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = flags;`
			`*cs++ = scratch_addr \| PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = 0;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)`
			`{`
			`/* First we do the gen6_emit_post_sync_nonzero_flush w/a */`
			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = PIPE_CONTROL_CS_STALL \| PIPE_CONTROL_STALL_AT_SCOREBOARD;`
			`*cs++ = 0;`
			`*cs++ = 0;`

			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = PIPE_CONTROL_QW_WRITE;`
			`*cs++ = intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_DEFAULT) \|`
			`PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = 0;`

			`/* Finally we can flush and with it emit the breadcrumb */`
			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|`
			`PIPE_CONTROL_DEPTH_CACHE_FLUSH \|`
			`PIPE_CONTROL_DC_FLUSH_ENABLE \|`
			`PIPE_CONTROL_QW_WRITE \|`
			`PIPE_CONTROL_CS_STALL);`
			`*cs++ = i915_request_active_seqno(rq) \|`
			`PIPE_CONTROL_GLOBAL_GTT;`
			`*cs++ = rq->fence.seqno;`

			`*cs++ = MI_USER_INTERRUPT;`
			`*cs++ = MI_NOOP;`

			`rq->tail = intel_ring_offset(rq, cs);`
			`assert_ring_tail_valid(rq->ring, rq->tail);`

			`return cs;`
			`}`

			`static int mi_flush_dw(struct i915_request *rq, u32 flags)`
			`{`
			`u32 cmd, *cs;`

			`cs = intel_ring_begin(rq, 4);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`cmd = MI_FLUSH_DW;`

			`/*`
			`* We always require a command barrier so that subsequent`
			`* commands, such as breadcrumb interrupts, are strictly ordered`
			`* wrt the contents of the write cache being flushed to memory`
			`* (and thus being coherent from the CPU).`
			`*/`
			`cmd \|= MI_FLUSH_DW_STORE_INDEX \| MI_FLUSH_DW_OP_STOREDW;`

			`/*`
			`* Bspec vol 1c.3 - blitter engine command streamer:`
			`* "If ENABLED, all TLBs will be invalidated once the flush`
			`* operation is complete. This bit is only valid when the`
			`* Post-Sync Operation field is a value of 1h or 3h."`
			`*/`
			`cmd \|= flags;`

			`*cs++ = cmd;`
			`*cs++ = HWS_SCRATCH_ADDR \| MI_FLUSH_DW_USE_GTT;`
			`*cs++ = 0;`
			`*cs++ = MI_NOOP;`

			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags)`
			`{`
			`return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0);`
			`}`

			`int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode)`
			`{`
			`return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB);`
			`}`

			`int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode)`
			`{`
			`return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB \| MI_INVALIDATE_BSD);`
			`}`

			`int gen6_emit_bb_start(struct i915_request *rq,`
			`u64 offset, u32 len,`
			`unsigned int dispatch_flags)`
			`{`
			`u32 security;`
			`u32 *cs;`

			`security = MI_BATCH_NON_SECURE_I965;`
			`if (dispatch_flags & I915_DISPATCH_SECURE)`
			`security = 0;`

			`cs = intel_ring_begin(rq, 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`cs = __gen6_emit_bb_start(cs, offset, security);`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int`
			`hsw_emit_bb_start(struct i915_request *rq,`
			`u64 offset, u32 len,`
			`unsigned int dispatch_flags)`
			`{`
			`u32 security;`
			`u32 *cs;`

			`security = MI_BATCH_PPGTT_HSW \| MI_BATCH_NON_SECURE_HSW;`
			`if (dispatch_flags & I915_DISPATCH_SECURE)`
			`security = 0;`

			`cs = intel_ring_begin(rq, 2);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`cs = __gen6_emit_bb_start(cs, offset, security);`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`static int gen7_stall_cs(struct i915_request *rq)`
			`{`
			`u32 *cs;`

			`cs = intel_ring_begin(rq, 4);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = PIPE_CONTROL_CS_STALL \| PIPE_CONTROL_STALL_AT_SCOREBOARD;`
			`*cs++ = 0;`
			`*cs++ = 0;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode)`
			`{`
			`u32 scratch_addr =`
			`intel_gt_scratch_offset(rq->engine->gt,`
			`INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH);`
			`u32 *cs, flags = 0;`

			`/*`
			`* Ensure that any following seqno writes only happen when the render`
			`* cache is indeed flushed.`
			`*`
			`* Workaround: 4th PIPE_CONTROL command (except the ones with only`
			`* read-cache invalidate bits set) must have the CS_STALL bit set. We`
			`* don't try to be clever and just set it unconditionally.`
			`*/`
			`flags \|= PIPE_CONTROL_CS_STALL;`

			`/*`
			`* CS_STALL suggests at least a post-sync write.`
			`*/`
			`flags \|= PIPE_CONTROL_QW_WRITE;`
			`flags \|= PIPE_CONTROL_GLOBAL_GTT_IVB;`

			`/*`
			`* Just flush everything. Experiments have shown that reducing the`
			`* number of bits based on the write domains has little performance`
			`* impact.`
			`*/`
			`if (mode & EMIT_FLUSH) {`
			`flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;`
			`flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;`
			`flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;`
			`flags \|= PIPE_CONTROL_FLUSH_ENABLE;`
			`}`
			`if (mode & EMIT_INVALIDATE) {`
			`flags \|= PIPE_CONTROL_TLB_INVALIDATE;`
			`flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;`
			`flags \|= PIPE_CONTROL_MEDIA_STATE_CLEAR;`

			`/*`
			`* Workaround: we must issue a pipe_control with CS-stall bit`
			`* set before a pipe_control command that has the state cache`
			`* invalidate bit set.`
			`*/`
			`gen7_stall_cs(rq);`
			`}`

			`cs = intel_ring_begin(rq, 4);`
			`if (IS_ERR(cs))`
			`return PTR_ERR(cs);`

			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = flags;`
			`*cs++ = scratch_addr;`
			`*cs++ = 0;`
			`intel_ring_advance(rq, cs);`

			`return 0;`
			`}`

			`u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs)`
			`{`
			`*cs++ = GFX_OP_PIPE_CONTROL(4);`
			`*cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH \|`
			`PIPE_CONTROL_DEPTH_CACHE_FLUSH \|`
			`PIPE_CONTROL_DC_FLUSH_ENABLE \|`
			`PIPE_CONTROL_FLUSH_ENABLE \|`
			`PIPE_CONTROL_QW_WRITE \|`
			`PIPE_CONTROL_GLOBAL_GTT_IVB \|`
			`PIPE_CONTROL_CS_STALL);`
			`*cs++ = i915_request_active_seqno(rq);`
			`*cs++ = rq->fence.seqno;`

			`*cs++ = MI_USER_INTERRUPT;`
			`*cs++ = MI_NOOP;`

			`rq->tail = intel_ring_offset(rq, cs);`
			`assert_ring_tail_valid(rq->ring, rq->tail);`

			`return cs;`
			`}`

			`u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)`
			`{`
			`GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);`
			`GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);`

			`*cs++ = MI_FLUSH_DW \| MI_FLUSH_DW_OP_STOREDW \| MI_FLUSH_DW_STORE_INDEX;`
			`*cs++ = I915_GEM_HWS_SEQNO_ADDR \| MI_FLUSH_DW_USE_GTT;`
			`*cs++ = rq->fence.seqno;`

			`*cs++ = MI_USER_INTERRUPT;`

			`rq->tail = intel_ring_offset(rq, cs);`
			`assert_ring_tail_valid(rq->ring, rq->tail);`

			`return cs;`
			`}`

			`#define GEN7_XCS_WA 32`
			`u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs)`
			`{`
			`int i;`

			`GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma);`
			`GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);`

			`*cs++ = MI_FLUSH_DW \| MI_INVALIDATE_TLB \|`
			`MI_FLUSH_DW_OP_STOREDW \| MI_FLUSH_DW_STORE_INDEX;`
			`*cs++ = I915_GEM_HWS_SEQNO_ADDR \| MI_FLUSH_DW_USE_GTT;`
			`*cs++ = rq->fence.seqno;`

			`for (i = 0; i < GEN7_XCS_WA; i++) {`
			`*cs++ = MI_STORE_DWORD_INDEX;`
			`*cs++ = I915_GEM_HWS_SEQNO_ADDR;`
			`*cs++ = rq->fence.seqno;`
			`}`

			`*cs++ = MI_FLUSH_DW;`
			`*cs++ = 0;`
			`*cs++ = 0;`

			`*cs++ = MI_USER_INTERRUPT;`
			`*cs++ = MI_NOOP;`

			`rq->tail = intel_ring_offset(rq, cs);`
			`assert_ring_tail_valid(rq->ring, rq->tail);`

			`return cs;`
			`}`
			`#undef GEN7_XCS_WA`

			`void gen6_irq_enable(struct intel_engine_cs *engine)`
			`{`
			`ENGINE_WRITE(engine, RING_IMR,`
			`~(engine->irq_enable_mask \| engine->irq_keep_mask));`

			`/* Flush/delay to ensure the RING_IMR is active before the GT IMR */`
			`ENGINE_POSTING_READ(engine, RING_IMR);`

			`gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);`
			`}`

			`void gen6_irq_disable(struct intel_engine_cs *engine)`
			`{`
			`ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);`
			`gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);`
			`}`

			`void hsw_irq_enable_vecs(struct intel_engine_cs *engine)`
			`{`
			`ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask);`

			`/* Flush/delay to ensure the RING_IMR is active before the GT IMR */`
			`ENGINE_POSTING_READ(engine, RING_IMR);`

			`gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask);`
			`}`

			`void hsw_irq_disable_vecs(struct intel_engine_cs *engine)`
			`{`
			`ENGINE_WRITE(engine, RING_IMR, ~0);`
			`gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask);`
			`}`