linux-zen-server/tools/include/linux/ring_buffer.h

#ifndef _TOOLS_LINUX_RING_BUFFER_H_
#define _TOOLS_LINUX_RING_BUFFER_H_

#include <asm/barrier.h>
#include <linux/perf_event.h>

/*
 * Contract with kernel for walking the perf ring buffer from
 * user space requires the following barrier pairing (quote
 * from kernel/events/ring_buffer.c):
 *
 *   Since the mmap() consumer (userspace) can run on a
 *   different CPU:
 *
 *   kernel                             user
 *
 *   if (LOAD ->data_tail) {            LOAD ->data_head
 *                      (A)             smp_rmb()       (C)
 *      STORE $data                     LOAD $data
 *      smp_wmb()       (B)             smp_mb()        (D)
 *      STORE ->data_head               STORE ->data_tail
 *   }
 *
 *   Where A pairs with D, and B pairs with C.
 *
 *   In our case A is a control dependency that separates the
 *   load of the ->data_tail and the stores of $data. In case
 *   ->data_tail indicates there is no room in the buffer to
 *   store $data we do not.
 *
 *   D needs to be a full barrier since it separates the data
 *   READ from the tail WRITE.
 *
 *   For B a WMB is sufficient since it separates two WRITEs,
 *   and for C an RMB is sufficient since it separates two READs.
 *
 * Note, instead of B, C, D we could also use smp_store_release()
 * in B and D as well as smp_load_acquire() in C.
 *
 * However, this optimization does not make sense for all kernel
 * supported architectures, since for a fair number it would
 * resolve into a READ_ONCE() + smp_mb() pair for smp_load_acquire(),
 * and an smp_mb() + WRITE_ONCE() pair for smp_store_release().
 *
 * Thus, for those, smp_wmb() in B and smp_rmb() in C would still
 * be less expensive. For the case of D this has either the same
 * cost or is less expensive; for example, due to TSO, x86 can
 * avoid the CPU barrier entirely.
 */

static inline u64 ring_buffer_read_head(struct perf_event_mmap_page *base)
{
	/*
	 * Architectures where smp_load_acquire() does not fall back to a
	 * READ_ONCE() + smp_mb() pair.
	 */
#if defined(__x86_64__) || defined(__aarch64__) || defined(__powerpc64__) || \
    defined(__ia64__) || (defined(__sparc__) && defined(__arch64__))
	return smp_load_acquire(&base->data_head);
#else
	/* Fallback: plain load of ->data_head followed by an explicit read barrier (C). */
	u64 head = READ_ONCE(base->data_head);

	smp_rmb();
	return head;
#endif
}

static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base,
					  u64 tail)
{
	/* (D): orders all prior record reads before the tail store. */
	smp_store_release(&base->data_tail, tail);
}
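
/*
 * Illustrative sketch, not part of the original header: a minimal
 * consumer loop built on the two helpers above, showing the user-space
 * side of the contract: LOAD ->data_head with acquire (C), read the
 * records, then STORE ->data_tail with release (D). The process_record()
 * callback is hypothetical, and the sketch assumes 'size' is a power of
 * two and that no record wraps past the end of the data area; real
 * consumers (e.g. perf's mmap reading code) copy wrapped records out
 * before parsing them.
 */
static inline void ring_buffer_example_drain(struct perf_event_mmap_page *base,
					     void *data, u64 size,
					     void (*process_record)(struct perf_event_header *hdr))
{
	u64 head = ring_buffer_read_head(base);	/* acquire: (C) */
	u64 tail = base->data_tail;	/* only user space writes data_tail */

	while (tail != head) {
		struct perf_event_header *hdr;

		hdr = (struct perf_event_header *)((char *)data +
						   (tail & (size - 1)));
		process_record(hdr);	/* all $data loads precede (D) */
		tail += hdr->size;
	}

	ring_buffer_write_tail(base, tail);	/* release: (D) */
}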
#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */