README.md
Rendering markdown...
package main
import (
"fmt"
"sync/atomic"
"syscall"
"unsafe"
)
// Raw io_uring ABI values used by the wrappers in this file. The iORING_*
// names echo the kernel's IORING_* spelling (with a lowered first letter to
// keep them unexported) so they can be matched against the kernel headers.
const (
	// Syscall numbers of the three io_uring entry points.
	sysIoUringSetup    = 425
	sysIoUringEnter    = 426
	sysIoUringRegister = 427

	// mmap offsets that select which shared region of the ring fd to map.
	iORING_OFF_SQ_RING = 0
	iORING_OFF_CQ_RING = 0x0800_0000
	iORING_OFF_SQES    = 0x1000_0000

	// Opcodes passed as the second argument of the io_uring_register syscall.
	iORING_REGISTER_BUFFERS       = 0
	iORING_REGISTER_CLONE_BUFFERS = 30

	// Submission opcode written into iouringSQE.opcode.
	iORING_OP_READ_FIXED = 4

	// Flag passed to the io_uring_enter syscall.
	iORING_ENTER_GETEVENTS = 1
)
// iouringParams mirrors the kernel's io_uring_params structure (kernel ABI:
// field order and widths must not change).
//
// uringSetup passes a zeroed value to the io_uring_setup syscall and then
// reads the entry counts and ring offsets out of it to size and locate the
// shared mappings.
type iouringParams struct {
	sqEntries    uint32 // SQ entry count; sizes the SQ index array and the SQE region
	cqEntries    uint32 // CQ entry count; sizes the CQE array
	flags        uint32
	sqThreadCPU  uint32
	sqThreadIdle uint32
	features     uint32
	wqFd         uint32
	resv         [3]uint32
	sqOff        sqringOffsets // field offsets inside the SQ ring mapping
	cqOff        cqringOffsets // field offsets inside the CQ ring mapping
}
// sqringOffsets is the submission-queue half of iouringParams. Each uint32 is
// a byte offset that uringSetup adds to the base of the SQ ring mapping to
// locate the corresponding ring field. Field order and widths are kernel ABI.
type sqringOffsets struct {
	head        uint32 // offset of the SQ head index
	tail        uint32 // offset of the SQ tail index
	ringMask    uint32 // offset of the SQ index mask
	ringEntries uint32
	flags       uint32
	dropped     uint32
	array       uint32 // offset of the SQ index array; also used to compute the mapping size
	resv1       uint32
	userAddr    uint64
}
// cqringOffsets is the completion-queue half of iouringParams. Each uint32 is
// a byte offset that uringSetup adds to the base of the CQ ring mapping to
// locate the corresponding ring field. Field order and widths are kernel ABI.
type cqringOffsets struct {
	head        uint32 // offset of the CQ head index
	tail        uint32 // offset of the CQ tail index
	ringMask    uint32 // offset of the CQ index mask
	ringEntries uint32
	overflow    uint32
	cqes        uint32 // offset of the CQE array; also used to compute the mapping size
	flags       uint32
	resv1       uint32
	userAddr    uint64
}
// iouringSQE mirrors the kernel's io_uring_sqe submission entry. It is exactly
// 64 bytes, which is the stride the rest of this file uses when indexing the
// SQE region. Field order and widths are kernel ABI.
type iouringSQE struct {
	opcode      uint8 // operation code, e.g. iORING_OP_READ_FIXED
	flags       uint8
	ioprio      uint16
	fd          int32  // file descriptor the operation targets
	off         uint64 // file offset
	addr        uint64 // buffer address
	length      uint32 // buffer length in bytes
	rwFlags     uint32
	userData    uint64 // caller-chosen tag stored with the request
	bufIndex    uint16 // index into the registered-buffer table
	personality uint16
	spliceFdIn  int32
	addr3       uint64
	pad2        uint64
}
// iouringCQE mirrors the kernel's io_uring_cqe completion entry (16 bytes,
// the stride uringWaitCQE uses when indexing the CQE array). Field order and
// widths are kernel ABI.
type iouringCQE struct {
	userData uint64
	res      int32 // signed result of the completed operation; returned by uringWaitCQE
	flags    uint32
}
// iouringCloneBuffers mirrors the kernel's io_uring_clone_buffers argument
// block, passed to io_uring_register with iORING_REGISTER_CLONE_BUFFERS.
// Field order and widths are kernel ABI.
type iouringCloneBuffers struct {
	srcFd  uint32 // fd of the ring to clone buffers from
	flags  uint32
	srcOff uint32
	dstOff uint32
	nr     uint32
	pad    [3]uint32
}
// uring is the process-side handle for one io_uring instance: the ring file
// descriptor, the three shared mappings created by uringSetup, and pointers
// into those mappings for the fields the submit/wait helpers touch.
type uring struct {
	fd int // ring file descriptor; uringDestroy sets it to -1 after closing

	// Base addresses of the three mmap'd regions (0 means "not mapped").
	sqRing uintptr
	cqRing uintptr
	sqes   uintptr

	// Sizes in bytes of the regions above, as computed by uringSetup.
	sqRingSz uintptr
	cqRingSz uintptr
	sqesSz   uintptr

	// Pointers into the SQ ring mapping.
	sqHead  *uint32
	sqTail  *uint32
	sqMask  *uint32
	sqArray *uint32 // first element of the SQ index array

	// Pointers into the CQ ring mapping.
	cqHead *uint32
	cqTail *uint32
	cqMask *uint32
	cqes   *iouringCQE // first element of the CQE array
}
// uringSetup creates an io_uring instance sized for `entries` submissions and
// maps its three shared regions (SQ ring, CQ ring, SQE array) into the
// process.
//
// On success the returned *uring owns the ring fd and all three mappings;
// release them with uringDestroy. On failure everything acquired so far is
// released and the failing syscall's errno is returned wrapped (usable with
// errors.Is / errors.As).
func uringSetup(entries uint32) (*uring, error) {
	var p iouringParams
	fd, _, errno := syscall.Syscall(sysIoUringSetup, uintptr(entries), uintptr(unsafe.Pointer(&p)), 0)
	if errno != 0 {
		return nil, fmt.Errorf("io_uring_setup: %w", errno)
	}

	r := &uring{fd: int(fd)}
	// Region sizes come from the geometry the kernel wrote back into p: each
	// ring is everything up to its trailing array plus the array itself
	// (4-byte SQ indices, 16-byte CQEs); the SQE region is 64 bytes per entry.
	r.sqRingSz = uintptr(p.sqOff.array) + uintptr(p.sqEntries)*4
	r.cqRingSz = uintptr(p.cqOff.cqes) + uintptr(p.cqEntries)*16
	r.sqesSz = uintptr(p.sqEntries) * 64

	// mapRegion maps `size` bytes of the ring fd at the given region offset.
	mapRegion := func(size, offset uintptr) (uintptr, syscall.Errno) {
		addr, _, e := syscall.Syscall6(syscall.SYS_MMAP, 0, size,
			syscall.PROT_READ|syscall.PROT_WRITE,
			syscall.MAP_SHARED|syscall.MAP_POPULATE,
			fd, offset)
		return addr, e
	}
	// unmap releases a mapping with the raw munmap syscall. syscall.Munmap is
	// not usable here: it only accepts slices that syscall.Mmap handed out
	// (and requires len == cap), so it rejects these raw mappings with EINVAL
	// and leaves them mapped.
	unmap := func(addr, size uintptr) {
		// Best-effort: we are already on an error path and report the mmap failure.
		_, _, _ = syscall.Syscall(syscall.SYS_MUNMAP, addr, size, 0)
	}

	sqRing, errno := mapRegion(r.sqRingSz, iORING_OFF_SQ_RING)
	if errno != 0 {
		syscall.Close(int(fd))
		return nil, fmt.Errorf("mmap sq_ring: %w", errno)
	}
	r.sqRing = sqRing

	cqRing, errno := mapRegion(r.cqRingSz, iORING_OFF_CQ_RING)
	if errno != 0 {
		unmap(sqRing, r.sqRingSz)
		syscall.Close(int(fd))
		return nil, fmt.Errorf("mmap cq_ring: %w", errno)
	}
	r.cqRing = cqRing

	sqes, errno := mapRegion(r.sqesSz, iORING_OFF_SQES)
	if errno != 0 {
		unmap(cqRing, r.cqRingSz)
		unmap(sqRing, r.sqRingSz)
		syscall.Close(int(fd))
		return nil, fmt.Errorf("mmap sqes: %w", errno)
	}
	r.sqes = sqes

	// Resolve the ring fields the submit/wait helpers need. These pointers
	// refer to mmap'd memory outside the Go heap, so building them from a
	// uintptr base plus offset is intentional.
	r.sqHead = (*uint32)(unsafe.Pointer(sqRing + uintptr(p.sqOff.head)))
	r.sqTail = (*uint32)(unsafe.Pointer(sqRing + uintptr(p.sqOff.tail)))
	r.sqMask = (*uint32)(unsafe.Pointer(sqRing + uintptr(p.sqOff.ringMask)))
	r.sqArray = (*uint32)(unsafe.Pointer(sqRing + uintptr(p.sqOff.array)))
	r.cqHead = (*uint32)(unsafe.Pointer(cqRing + uintptr(p.cqOff.head)))
	r.cqTail = (*uint32)(unsafe.Pointer(cqRing + uintptr(p.cqOff.tail)))
	r.cqMask = (*uint32)(unsafe.Pointer(cqRing + uintptr(p.cqOff.ringMask)))
	r.cqes = (*iouringCQE)(unsafe.Pointer(cqRing + uintptr(p.cqOff.cqes)))

	logf("io_uring fd=%d sq_ring@%#x(sz=%#x) cq_ring@%#x(sz=%#x) sqes@%#x(sz=%#x)",
		fd, sqRing, r.sqRingSz, cqRing, r.cqRingSz, sqes, r.sqesSz)
	return r, nil
}
// uringRegisterBuffers registers the single memory range starting at buf and
// spanning length bytes as a fixed buffer of ring r, using the
// io_uring_register syscall with iORING_REGISTER_BUFFERS and one iovec.
//
// It returns nil on success, or the syscall errno wrapped with context.
func uringRegisterBuffers(r *uring, buf uintptr, length uintptr) error {
	var iov syscall.Iovec
	iov.Base = (*byte)(unsafe.Pointer(buf))
	iov.Len = uint64(length)

	const iovecCount = 1
	if _, _, errno := syscall.Syscall6(sysIoUringRegister,
		uintptr(r.fd), iORING_REGISTER_BUFFERS,
		uintptr(unsafe.Pointer(&iov)), iovecCount, 0, 0); errno != 0 {
		return fmt.Errorf("io_uring_register buffers: %w", errno)
	}
	return nil
}
// uringCloneBuffers asks the kernel to clone the registered buffers of ring
// src into ring dst via io_uring_register with iORING_REGISTER_CLONE_BUFFERS.
// Only the source fd is filled in; every other field of the argument block is
// left zero.
//
// It returns nil on success, or the syscall errno wrapped with context.
func uringCloneBuffers(dst, src *uring) error {
	var arg iouringCloneBuffers
	arg.srcFd = uint32(src.fd)

	const argCount = 1
	if _, _, errno := syscall.Syscall6(sysIoUringRegister,
		uintptr(dst.fd), iORING_REGISTER_CLONE_BUFFERS,
		uintptr(unsafe.Pointer(&arg)), argCount, 0, 0); errno != 0 {
		return fmt.Errorf("io_uring clone buffers: %w", errno)
	}
	return nil
}
// uringSubmitReadFixed queues one iORING_OP_READ_FIXED request on ring r and
// submits it with io_uring_enter.
//
// The request reads length bytes from fileFd at file offset 0 into buf, uses
// registered-buffer index 0, and is tagged with user data 0x1234. The enter
// call submits one entry and asks for one completion with
// iORING_ENTER_GETEVENTS.
//
// It returns nil if io_uring_enter reported no errno, otherwise the errno
// wrapped with context. The completion itself is collected separately.
func uringSubmitReadFixed(r *uring, fileFd int, buf uintptr, length uint32) error {
	tail := atomic.LoadUint32(r.sqTail)
	slot := tail & *r.sqMask

	// Overwrite the whole 64-byte SQE for this slot; unnamed fields are zeroed.
	entry := (*iouringSQE)(unsafe.Pointer(r.sqes + uintptr(slot)*64))
	*entry = iouringSQE{
		opcode:   iORING_OP_READ_FIXED,
		fd:       int32(fileFd),
		off:      0,
		addr:     uint64(buf),
		length:   length,
		userData: 0x1234,
		bufIndex: 0,
	}

	// Point the SQ index array at the SQE, then publish it by advancing the
	// tail; the atomic store must come after both writes above.
	arraySlot := (*uint32)(unsafe.Pointer(uintptr(unsafe.Pointer(r.sqArray)) + uintptr(slot)*4))
	*arraySlot = slot
	atomic.StoreUint32(r.sqTail, tail+1)

	const toSubmit, minComplete = 1, 1
	if _, _, errno := syscall.Syscall6(sysIoUringEnter,
		uintptr(r.fd), toSubmit, minComplete, iORING_ENTER_GETEVENTS, 0, 0); errno != 0 {
		return fmt.Errorf("io_uring_enter: %w", errno)
	}
	return nil
}
// uringWaitCQE polls ring r's completion queue until an entry is available,
// then consumes it and returns its res field.
//
// The CQ tail is checked up to 1001 times with a 1 ms sleep between checks
// (about one second in total). If the tail never moves past the head, it
// returns 0 and a "CQE timeout" error. On success the CQ head is advanced by
// one so the slot can be reused.
func uringWaitCQE(r *uring) (int32, error) {
	const maxSleeps = 1000
	pause := syscall.Timespec{Nsec: 1_000_000}

	head := atomic.LoadUint32(r.cqHead)
	for slept := 0; atomic.LoadUint32(r.cqTail) == head; slept++ {
		if slept == maxSleeps {
			return 0, fmt.Errorf("CQE timeout")
		}
		syscall.Nanosleep(&pause, nil)
	}

	// Read the result before releasing the slot back by advancing the head.
	slot := head & *r.cqMask
	entry := (*iouringCQE)(unsafe.Pointer(uintptr(unsafe.Pointer(r.cqes)) + uintptr(slot)*16))
	result := entry.res
	atomic.StoreUint32(r.cqHead, head+1)
	return result, nil
}
// uringDestroy releases everything uringSetup acquired for r: the SQE, CQ
// ring and SQ ring mappings (in that order) and then the ring fd. A nil r is
// a no-op, and calling it again on the same r does nothing further.
//
// The mappings are released with the raw munmap syscall. syscall.Munmap is
// not usable here: it only accepts slices that syscall.Mmap handed out (and
// requires len == cap), so for these raw mappings it returns EINVAL and leaves
// them in place, and a mapping that stays in place survives closing the fd.
func uringDestroy(r *uring) {
	if r == nil {
		return
	}

	// These pointers alias the mappings about to disappear; clear them so a
	// stale use fails as a nil dereference rather than touching unmapped memory.
	r.sqHead, r.sqTail, r.sqMask, r.sqArray = nil, nil, nil, nil
	r.cqHead, r.cqTail, r.cqMask, r.cqes = nil, nil, nil, nil

	// unmap releases one region and zeroes its bookkeeping so a repeated
	// destroy cannot unmap an address range that has since been reused.
	unmap := func(addr, size *uintptr) {
		if *addr == 0 {
			return
		}
		// Best-effort teardown: there is no caller to report a failure to.
		_, _, _ = syscall.Syscall(syscall.SYS_MUNMAP, *addr, *size, 0)
		*addr, *size = 0, 0
	}
	unmap(&r.sqes, &r.sqesSz)
	unmap(&r.cqRing, &r.cqRingSz)
	unmap(&r.sqRing, &r.sqRingSz)

	if r.fd >= 0 {
		syscall.Close(r.fd)
		r.fd = -1
	}
}