/**
 * io_uring system api definitions.
 *
 * See: https://github.com/torvalds/linux/blob/master/include/uapi/linux/io_uring.h
 *
 * Last changes from: 760618f7a8e3b63aa06266efb301719c374e29d4 (20200724)
 */
module during.io_uring;

version (linux):

import core.sys.posix.poll;
import core.sys.posix.signal;

@system nothrow @nogc:

/**
 * IO operation submission data structure (Submission queue entry).
 *
 * C API: `struct io_uring_sqe`
 */
struct SubmissionEntry
{
    Operation               opcode;         /// type of operation for this sqe
    SubmissionEntryFlags    flags;          /// IOSQE_ flags
    ushort                  ioprio;         /// ioprio for the request
    int                     fd;             /// file descriptor to do IO on
    union
    {
        ulong off;                          /// offset into file
        ulong addr2;                        /// from Linux 5.5
    }

    union
    {
        ulong addr;                         /// pointer to buffer or iovecs
        ulong splice_off_in;
    }
    uint len;                               /// buffer size or number of iovecs

    union
    {
        ReadWriteFlags      rw_flags;
        FsyncFlags          fsync_flags;
        PollEvents          poll_events;        // changed in https://github.com/torvalds/linux/commit/5769a351b89cd4d82016f18fa5f6c4077403564d
        uint                poll_events32;      /// from Linux 5.9 - word-reversed for BE
        SyncFileRangeFlags  sync_range_flags;   /// from Linux 5.2
        MsgFlags            msg_flags;          /// from Linux 5.3
        TimeoutFlags        timeout_flags;      /// from Linux 5.4
        AcceptFlags         accept_flags;       /// from Linux 5.5
        uint                cancel_flags;       /// from Linux 5.5
        uint                open_flags;         /// from Linux 5.6
        uint                statx_flags;        /// from Linux 5.6
        uint                fadvise_advice;     /// from Linux 5.6
        uint                splice_flags;       /// from Linux 5.7
    }

    ulong user_data;                        /// data to be passed back at completion time

    union
    {
        struct
        {
            union
            {
                ushort buf_index;           /// index into fixed buffers, if used
                ushort buf_group;           /// for grouped buffer selection
            }
            ushort personality;             /// personality to use, if used
            int splice_fd_in;
        }

        ulong[3] __pad2;
    }

    /// Resets entry fields
    void clear() @safe nothrow @nogc
    {
        this = SubmissionEntry.init;
    }
}
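
// Illustrative sketch (not part of the C API above): filling a `SubmissionEntry` for a
// vectored read, following the field usage documented in the struct. The helper name and
// its parameters are invented for this example; the iovecs and the buffers they point to
// must remain valid until the request completes.
private void exampleFillReadv(ref SubmissionEntry sqe, int fd, void* iovecs, uint nrVecs,
    ulong fileOffset, ulong token)
{
    sqe.clear();
    sqe.opcode = Operation.READV;
    sqe.fd = fd;                    // file descriptor to read from
    sqe.addr = cast(ulong)iovecs;   // pointer to an array of iovec structures
    sqe.len = nrVecs;               // number of iovecs in that array
    sqe.off = fileOffset;           // file offset to start reading at
    sqe.user_data = token;          // returned untouched in CompletionEntry.user_data
}
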
enum ReadWriteFlags : int
{
    NONE = 0,

    /// High priority read/write. Allows block-based filesystems to
    /// use polling of the device, which provides lower latency, but
    /// may use additional resources. (Currently, this feature is
    /// usable only on a file descriptor opened using the
    /// O_DIRECT flag.)
    ///
    /// (since Linux 4.6)
    HIPRI = 0x00000001,

    /// Provide a per-write equivalent of the O_DSYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    DSYNC = 0x00000002,

    /// Provide a per-write equivalent of the O_SYNC open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    ///
    /// (since Linux 4.7)
    SYNC = 0x00000004,

    /// Do not wait for data which is not immediately available. If
    /// this flag is specified, the preadv2() system call will
    /// return instantly if it would have to read data from the
    /// backing storage or wait for a lock. If some data was
    /// successfully read, it will return the number of bytes read.
    /// If no bytes were read, it will return -1 and set errno to
    /// EAGAIN. Currently, this flag is meaningful only for
    /// preadv2().
    ///
    /// (since Linux 4.14)
    NOWAIT = 0x00000008,

    /// Provide a per-write equivalent of the O_APPEND open(2) flag.
    /// This flag is meaningful only for pwritev2(), and its effect
    /// applies only to the data range written by the system call.
    /// The offset argument does not affect the write operation; the
    /// data is always appended to the end of the file. However, if
    /// the offset argument is -1, the current file offset is
    /// updated.
    ///
    /// (since Linux 4.16)
    APPEND = 0x00000010
}

enum FsyncFlags : uint
{
    /// Normal file integrity sync
    NORMAL = 0,

    /**
     * `fdatasync` semantics.
     *
     * See_Also: `fsync(2)` for details
     */
    DATASYNC = (1 << 0)
}

/** Possible poll event flags.
 * See: poll(2)
 */
enum PollEvents : uint
{
    /// There is data to read.
    IN = POLLIN,

    /** Writing is now possible, though a write larger than the available
     * space in a socket or pipe will still block (unless O_NONBLOCK is set).
     */
    OUT = POLLOUT,

    /** There is some exceptional condition on the file descriptor.
     * Possibilities include:
     *
     * * There is out-of-band data on a TCP socket (see tcp(7)).
     * * A pseudoterminal master in packet mode has seen a state
     *   change on the slave (see ioctl_tty(2)).
     * * A cgroup.events file has been modified (see cgroups(7)).
     */
    PRI = POLLPRI,

    /** Error condition (only returned in revents; ignored in events).
     * This bit is also set for a file descriptor referring to the
     * write end of a pipe when the read end has been closed.
     */
    ERR = POLLERR,

    /// Invalid request: fd not open (only returned in revents; ignored in events).
    NVAL = POLLNVAL,

    RDNORM = POLLRDNORM,    /// Equivalent to POLLIN.
    RDBAND = POLLRDBAND,    /// Priority band data can be read (generally unused on Linux).
    WRNORM = POLLWRNORM,    /// Equivalent to POLLOUT.
    WRBAND = POLLWRBAND,    /// Priority data may be written.

    /** Hang up (only returned in revents; ignored in events). Note
     * that when reading from a channel such as a pipe or a stream
     * socket, this event merely indicates that the peer closed its
     * end of the channel. Subsequent reads from the channel will
     * return 0 (end of file) only after all outstanding data in the
     * channel has been consumed.
     */
    HUP = POLLHUP,

    /** (since Linux 2.6.17)
     * Stream socket peer closed connection, or shut down writing half of connection.
     */
    RDHUP = 0x2000,

    /** (since Linux 4.5)
     * Sets an exclusive wakeup mode for the epoll file descriptor that is being attached to the
     * target file descriptor, fd. When a wakeup event occurs and multiple epoll file descriptors
     * are attached to the same target file using EPOLLEXCLUSIVE, one or more of the epoll file
     * descriptors will receive an event with epoll_wait(2). The default in this scenario (when
     * EPOLLEXCLUSIVE is not set) is for all epoll file descriptors to receive an event.
     * EPOLLEXCLUSIVE is thus useful for avoiding thundering herd problems in certain scenarios.
     */
    EXCLUSIVE = 0x10000000,
}
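
// Illustrative sketch: preparing a single-shot poll request with the flags above. The
// helper is invented for this example; on completion, `CompletionEntry.res` carries the
// returned event mask.
private void exampleFillPollAdd(ref SubmissionEntry sqe, int fd, ulong token)
{
    sqe.clear();
    sqe.opcode = Operation.POLL_ADD;
    sqe.fd = fd;                        // descriptor to watch
    sqe.poll_events = PollEvents.IN;    // wake up when fd becomes readable
    sqe.user_data = token;              // identifies this poll in the completion
}
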
/**
 * Flags for `sync_file_range(2)` operation.
 *
 * See_Also: `sync_file_range(2)` for details
 */
enum SyncFileRangeFlags : uint
{
    NOOP = 0,   /// no operation

    /// Wait upon write-out of all pages in the specified range that have already been submitted to
    /// the device driver for write-out before performing any write.
    WAIT_BEFORE = 1U << 0,

    /// Initiate write-out of all dirty pages in the specified range which are not presently
    /// submitted for write-out. Note that even this may block if you attempt to write more than
    /// the request queue size.
    WRITE = 1U << 1,

    /// Wait upon write-out of all pages in the range after performing any write.
    WAIT_AFTER = 1U << 2,

    /// This is a write-for-data-integrity operation that will ensure that all pages in the
    /// specified range which were dirty when sync_file_range() was called are committed to disk.
    WRITE_AND_WAIT = WAIT_BEFORE | WRITE | WAIT_AFTER
}

/**
 * Flags for `sendmsg(2)` and `recvmsg(2)` operations.
 *
 * See_Also: man pages for the operations.
 */
enum MsgFlags : uint
{
    /// No flags defined
    NONE = 0,

    /// Sends out-of-band data on sockets that support this notion (e.g., of type `SOCK_STREAM`); the
    /// underlying protocol must also support out-of-band data.
    OOB = 0x01,

    /// This flag causes the receive operation to return data from the beginning of the receive
    /// queue without removing that data from the queue. Thus, a subsequent receive call will return
    /// the same data.
    PEEK = 0x02,

    /// Don't use a gateway to send out the packet, send to hosts only on directly connected
    /// networks. This is usually used only by diagnostic or routing programs. This is defined only
    /// for protocol families that route; packet sockets don't.
    DONTROUTE = 0x04,

    /// For raw (`AF_PACKET`), Internet datagram (since Linux 2.4.27/2.6.8), netlink (since Linux
    /// 2.6.22), and UNIX datagram (since Linux 3.4) sockets: return the real length of the packet
    /// or datagram, even when it was longer than the passed buffer.
    ///
    /// For use with Internet stream sockets, see `tcp(7)`.
    TRUNC = 0x20,

    /// Enables nonblocking operation; if the operation would block, EAGAIN or EWOULDBLOCK is
    /// returned. This provides similar behavior to setting the O_NONBLOCK flag (via the `fcntl(2)`
    /// F_SETFL operation), but differs in that `MSG_DONTWAIT` is a per-call option, whereas
    /// `O_NONBLOCK` is a setting on the open file description (see `open(2)`), which will affect
    /// all threads in the calling process as well as other processes that hold file descriptors
    /// referring to the same open file description.
    DONTWAIT = 0x40,

    /// Terminates a record (when this notion is supported, as for sockets of type `SOCK_SEQPACKET`).
    EOR = 0x80,

    /// This flag requests that the operation block until the full request is satisfied. However,
    /// the call may still return less data than requested if a signal is caught, an error or
    /// disconnect occurs, or the next data to be received is of a different type than that
    /// returned. This flag has no effect for datagram sockets.
    WAITALL = 0x100,

    /// Tell the link layer that forward progress happened: you got a successful reply from the
    /// other side. If the link layer doesn't get this it will regularly reprobe the neighbor (e.g.,
    /// via a unicast ARP). Valid only on SOCK_DGRAM and SOCK_RAW sockets and currently
    /// implemented only for IPv4 and IPv6. See arp(7) for details.
    CONFIRM = 0x800,

    /// This flag specifies that queued errors should be received from the socket error queue. The
    /// error is passed in an ancillary message with a type dependent on the protocol (for IPv4
    /// `IP_RECVERR`). The user should supply a buffer of sufficient size. See `cmsg(3)` and `ip(7)`
    /// for more information. The payload of the original packet that caused the error is passed as
    /// normal data via msg_iovec. The original destination address of the datagram that caused the
    /// error is supplied via `msg_name`.
    ERRQUEUE = 0x2000,

    /// Don't generate a `SIGPIPE` signal if the peer on a stream-oriented socket has closed the
    /// connection. The `EPIPE` error is still returned. This provides similar behavior to using
    /// `sigaction(2)` to ignore `SIGPIPE`, but, whereas `MSG_NOSIGNAL` is a per-call feature,
    /// ignoring `SIGPIPE` sets a process attribute that affects all threads in the process.
    NOSIGNAL = 0x4000,

    /// The caller has more data to send. This flag is used with TCP sockets to obtain the same
    /// effect as the `TCP_CORK` socket option (see `tcp(7)`), with the difference that this flag can be
    /// set on a per-call basis.
    ///
    /// Since Linux 2.6, this flag is also supported for UDP sockets, and informs the kernel to
    /// package all of the data sent in calls with this flag set into a single datagram which is
    /// transmitted only when a call is performed that does not specify this flag.
    ///
    /// See_Also: the `UDP_CORK` socket option described in `udp(7)`
    MORE = 0x8000,

    /// Set the close-on-exec flag for the file descriptor received via a UNIX domain file
    /// descriptor using the `SCM_RIGHTS` operation (described in `unix(7)`). This flag is useful
    /// for the same reasons as the `O_CLOEXEC` flag of `open(2)`. (recvmsg only)
    CMSG_CLOEXEC = 0x40000000
}

/** sqe->timeout_flags
 */
enum TimeoutFlags : uint
{
    REL = 0,        /// Relative time is the default
    ABS = 1U << 0   /// Absolute time - `IORING_TIMEOUT_ABS` (from Linux 5.5)
}

/**
 * sqe->splice_flags
 * extends splice(2) flags
 */
enum SPLICE_F_FD_IN_FIXED = 1U << 31; /* the last bit of __u32 */
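
// Illustrative sketch: arming a timeout with the `TimeoutFlags` above. The field usage
// follows the common liburing convention for IORING_OP_TIMEOUT (addr points to a single
// kernel timespec, off holds the completion count); the helper itself is invented for
// this example and the timespec must outlive the request.
private void exampleFillTimeout(ref SubmissionEntry sqe, ref KernelTimespec ts, ulong token)
{
    sqe.clear();
    sqe.opcode = Operation.TIMEOUT;
    sqe.fd = -1;                            // not tied to any file descriptor
    sqe.addr = cast(ulong)&ts;              // pointer to a single KernelTimespec
    sqe.len = 1;                            // exactly one timespec is passed
    sqe.off = 0;                            // completion count; 0 means a pure timeout
    sqe.timeout_flags = TimeoutFlags.REL;   // ts is relative to "now"
    sqe.user_data = token;
}
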
/**
 * Flags that can be used with the `accept4(2)` operation.
 */
enum AcceptFlags : uint
{
    /// Same as `accept()`
    NONE = 0,

    /// Set the `O_NONBLOCK` file status flag on the new open file description. Using this flag saves
    /// extra calls to `fcntl(2)` to achieve the same result.
    NONBLOCK = 0x800,   // octal 00004000

    /// Set the close-on-exec (`FD_CLOEXEC`) flag on the new file descriptor. See the description of
    /// the `O_CLOEXEC` flag in `open(2)` for reasons why this may be useful.
    CLOEXEC = 0x80000   // octal 02000000
}

/**
 * Describes the operation to be performed
 *
 * See_Also: `io_uring_enter(2)`
 */
enum Operation : ubyte
{
    // available from Linux 5.1
    NOP = 0,                /// IORING_OP_NOP
    READV = 1,              /// IORING_OP_READV
    WRITEV = 2,             /// IORING_OP_WRITEV
    FSYNC = 3,              /// IORING_OP_FSYNC
    READ_FIXED = 4,         /// IORING_OP_READ_FIXED
    WRITE_FIXED = 5,        /// IORING_OP_WRITE_FIXED
    POLL_ADD = 6,           /// IORING_OP_POLL_ADD
    POLL_REMOVE = 7,        /// IORING_OP_POLL_REMOVE

    // available from Linux 5.2
    SYNC_FILE_RANGE = 8,    /// IORING_OP_SYNC_FILE_RANGE

    // available from Linux 5.3
    SENDMSG = 9,            /// IORING_OP_SENDMSG
    RECVMSG = 10,           /// IORING_OP_RECVMSG

    // available from Linux 5.4
    TIMEOUT = 11,           /// IORING_OP_TIMEOUT

    // available from Linux 5.5
    TIMEOUT_REMOVE = 12,    /// IORING_OP_TIMEOUT_REMOVE
    ACCEPT = 13,            /// IORING_OP_ACCEPT
    ASYNC_CANCEL = 14,      /// IORING_OP_ASYNC_CANCEL
    LINK_TIMEOUT = 15,      /// IORING_OP_LINK_TIMEOUT
    CONNECT = 16,           /// IORING_OP_CONNECT

    // available from Linux 5.6
    FALLOCATE = 17,         /// IORING_OP_FALLOCATE
    OPENAT = 18,            /// IORING_OP_OPENAT
    CLOSE = 19,             /// IORING_OP_CLOSE
    FILES_UPDATE = 20,      /// IORING_OP_FILES_UPDATE
    STATX = 21,             /// IORING_OP_STATX
    READ = 22,              /// IORING_OP_READ
    WRITE = 23,             /// IORING_OP_WRITE
    FADVISE = 24,           /// IORING_OP_FADVISE
    MADVISE = 25,           /// IORING_OP_MADVISE
    SEND = 26,              /// IORING_OP_SEND
    RECV = 27,              /// IORING_OP_RECV
    OPENAT2 = 28,           /// IORING_OP_OPENAT2
    EPOLL_CTL = 29,         /// IORING_OP_EPOLL_CTL

    // available from Linux 5.7
    SPLICE = 30,            /// IORING_OP_SPLICE
    PROVIDE_BUFFERS = 31,   /// IORING_OP_PROVIDE_BUFFERS
    REMOVE_BUFFERS = 32,    /// IORING_OP_REMOVE_BUFFERS

    // available from Linux 5.8
    TEE = 33,               /// IORING_OP_TEE
}

/// sqe->flags
enum SubmissionEntryFlags : ubyte
{
    NONE = 0,

    /// Use fixed fileset (`IOSQE_FIXED_FILE`)
    ///
    /// When this flag is specified, fd is an index into the files array registered with the
    /// io_uring instance (see the `IORING_REGISTER_FILES` section of the io_uring_register(2) man
    /// page).
    FIXED_FILE = 1U << 0,

    /**
     * `IOSQE_IO_DRAIN`: issue after inflight IO
     *
     * If a request is marked with `IO_DRAIN`, then previous commands must complete before this one
     * is issued. Subsequent requests are not started until the drain has completed.
     *
     * Note: available from Linux 5.2
     */
    IO_DRAIN = 1U << 1,

    /**
     * `IOSQE_IO_LINK`
     *
     * If set, the next SQE in the ring will depend on this SQE. A dependent SQE will not be started
     * until the parent SQE has completed. If the parent SQE fails, then a dependent SQE will be
     * failed without being started. Link chains can be arbitrarily long, the chain spans any new
     * SQE that continues to have the IOSQE_IO_LINK flag set. Once an SQE is encountered that does
     * not have this flag set, that defines the end of the chain. This feature allows forming
     * dependencies between individual SQEs.
     *
     * Note: available from Linux 5.3
     */
    IO_LINK = 1U << 2,

    /**
     * `IOSQE_IO_HARDLINK` - like LINK, but stronger
     *
     * Some commands will invariably end in a failure in the sense that the
     * completion result will be less than zero. One such example is timeouts
     * that don't have a completion count set, they will always complete with
     * `-ETIME` unless cancelled.
     *
     * For linked commands, we sever links and fail the rest of the chain if
     * the result is less than zero. Since we have commands where we know that
     * will happen, add IOSQE_IO_HARDLINK as a stronger link that doesn't sever
     * regardless of the completion result. Note that the link will still sever
     * if we fail submitting the parent request, hard links are only resilient
     * in the presence of completion results for requests that did submit
     * correctly.
     *
     * Note: available from Linux 5.5
     */
    IO_HARDLINK = 1U << 3,

    /**
     * `IOSQE_ASYNC`
     *
     * io_uring defaults to always doing inline submissions, if at all possible. But for larger
     * copies, even if the data is fully cached, that can take a long time. Add an IOSQE_ASYNC flag
     * that the application can set on the SQE - if set, it'll ensure that we always go async for
     * those kinds of requests.
     *
     * Note: available from Linux 5.6
     */
    ASYNC = 1U << 4,            /* always go async */

    /**
     * `IOSQE_BUFFER_SELECT`
     * If a server process has tons of pending socket connections, generally it uses epoll to wait
     * for activity. When the socket is ready for reading (or writing), the task can select a buffer
     * and issue a recv/send on the given fd.
     *
     * Now that we have fast (non-async thread) support, a task can have tons of reads or writes
     * pending. But that means they need buffers to back that data, and if the number of
     * connections is high enough, having them preallocated for all possible connections is
     * unfeasible.
     *
     * With IORING_OP_PROVIDE_BUFFERS, an application can register buffers to use for any request.
     * The request then sets IOSQE_BUFFER_SELECT in the sqe, and a given group ID in sqe->buf_group.
     * When the fd becomes ready, a free buffer from the specified group is selected. If none are
     * available, the request is terminated with -ENOBUFS. If successful, the CQE on completion will
     * contain the buffer ID chosen in the cqe->flags member, encoded as:
     *
     * `(buffer_id << IORING_CQE_BUFFER_SHIFT) | IORING_CQE_F_BUFFER;`
     *
     * Once a buffer has been consumed by a request, it is no longer available and must be
     * registered again with IORING_OP_PROVIDE_BUFFERS.
     *
     * Requests need to support this feature. For now, IORING_OP_READ and IORING_OP_RECV support it.
     * This is checked on SQE submission, a CQE with res == -EOPNOTSUPP will be posted if attempted
     * on unsupported requests.
     *
     * Note: available from Linux 5.7
     */
    BUFFER_SELECT = 1U << 5,    /* select buffer from sqe->buf_group */
}

/**
 * IO completion data structure (Completion Queue Entry)
 *
 * C API: `struct io_uring_cqe`
 */
struct CompletionEntry
{
    ulong    user_data; /** sqe->data submission passed back */
    int      res;       /** result code for this event */
    CQEFlags flags;
}

/// Flags used with `CompletionEntry`
enum CQEFlags : uint
{
    NONE = 0,   /// No flags set

    /// `IORING_CQE_F_BUFFER` If set, the upper 16 bits are the buffer ID
    /// Note: available from Linux 5.7
    BUFFER = 1U << 0
}
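
// Illustrative sketch: recovering the selected buffer ID from a completion when
// IOSQE_BUFFER_SELECT was used. Per the IORING_CQE_F_BUFFER description above, the buffer
// ID sits in the upper 16 bits of cqe.flags; the helper is invented for this example and
// returns -1 when no buffer was attached.
private int exampleSelectedBuffer(const ref CompletionEntry cqe)
{
    if (!(cqe.flags & CQEFlags.BUFFER)) return -1;  // no buffer was selected for this request
    return cast(ushort)(cast(uint)cqe.flags >> 16); // IORING_CQE_BUFFER_SHIFT == 16
}
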
/**
 * Passed in for io_uring_setup(2). Copied back with updated info on success.
 *
 * C API: `struct io_uring_params`
 */
struct SetupParameters
{
    // Magic offsets for the application to mmap the data it needs

    /// `IORING_OFF_SQ_RING`: mmap offset for submission queue ring
    enum ulong SUBMISSION_QUEUE_RING_OFFSET = 0UL;
    /// `IORING_OFF_CQ_RING`: mmap offset for completion queue ring
    enum ulong COMPLETION_QUEUE_RING_OFFSET = 0x8000000UL;
    /// `IORING_OFF_SQES`: mmap offset for submission entries
    enum ulong SUBMISSION_QUEUE_ENTRIES_OFFSET = 0x10000000UL;

    /// (output) allocated entries in submission queue
    /// (both ring index `array` and separate entry array at `SUBMISSION_QUEUE_ENTRIES_OFFSET`).
    uint                        sq_entries;

    /// (output) allocated entries in completion queue
    uint                        cq_entries;

    SetupFlags                  flags;      /// (input)

    /// (input) used if SQ_AFF and SQPOLL flags are active to pin poll thread to specific cpu.
    /// Right now it is always checked in kernel for "possible cpu".
    uint                        sq_thread_cpu;

    /// (input) used if SQPOLL flag is active; timeout in milliseconds
    /// until kernel poll thread goes to sleep.
    uint                        sq_thread_idle;
    SetupFeatures               features;   /// (from Linux 5.4)
    uint                        wq_fd;      /// (from Linux 5.6)
    private uint[3]             resv;       // reserved
    SubmissionQueueRingOffsets  sq_off;     /// (output) submission queue ring data field offsets
    CompletionQueueRingOffsets  cq_off;     /// (output) completion queue ring data field offsets
}

/// `io_uring_setup()` flags
enum SetupFlags : uint
{
    /// No flags set
    NONE = 0,

    /**
     * `IORING_SETUP_IOPOLL`
     *
     * Perform busy-waiting for an I/O completion, as opposed to getting notifications via an
     * asynchronous IRQ (Interrupt Request). The file system (if any) and block device must
     * support polling in order for this to work. Busy-waiting provides lower latency, but may
     * consume more CPU resources than interrupt driven I/O. Currently, this feature is usable
     * only on a file descriptor opened using the O_DIRECT flag. When a read or write is submitted
     * to a polled context, the application must poll for completions on the CQ ring by calling
     * io_uring_enter(2). It is illegal to mix and match polled and non-polled I/O on an io_uring
     * instance.
     */
    IOPOLL = 1U << 0,

    /**
     * `IORING_SETUP_SQPOLL`
     *
     * When this flag is specified, a kernel thread is created to perform submission queue polling.
     * An io_uring instance configured in this way enables an application to issue I/O without ever
     * context switching into the kernel.
     * By using the submission queue to fill in new submission queue entries and watching for
     * completions on the completion queue, the application can submit and reap I/Os without doing
     * a single system call.
     * If the kernel thread is idle for more than sq_thread_idle milliseconds, it will set the
     * IORING_SQ_NEED_WAKEUP bit in the flags field of the struct io_sq_ring. When this happens,
     * the application must call io_uring_enter(2) to wake the kernel thread. If I/O is kept busy,
     * the kernel thread will never sleep. An application making use of this feature will need to
     * guard the io_uring_enter(2) call with the following code sequence:
     *
     * ```
     * // Ensure that the wakeup flag is read after the tail pointer has been written.
     * smp_mb();
     * if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP)
     *     io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
     * ```
     *
     * where sq_ring is a submission queue ring setup using the struct io_sqring_offsets described below.
     *
     * To successfully use this feature, the application must register a set of files to be used for
     * IO through io_uring_register(2) using the IORING_REGISTER_FILES opcode. Failure to do so will
     * result in submitted IO being errored with EBADF.
     */
    SQPOLL = 1U << 1,

    /**
     * `IORING_SETUP_SQ_AFF`
     *
     * If this flag is specified, then the poll thread will be bound to the cpu set in the
     * sq_thread_cpu field of the struct io_uring_params. This flag is only meaningful when
     * IORING_SETUP_SQPOLL is specified.
     */
    SQ_AFF = 1U << 2,

    /**
     * `IORING_SETUP_CQSIZE`
     *
     * Create the completion queue with struct io_uring_params.cq_entries entries. The value must
     * be greater than entries, and may be rounded up to the next power-of-two.
     *
     * Note: Available from Linux 5.5
     */
    CQSIZE = 1U << 3,

    /**
     * `IORING_SETUP_CLAMP`
     *
     * Some applications like to start small in terms of ring size, and then ramp up as needed. This
     * is a bit tricky to do currently, since we don't advertise the max ring size.
     *
     * This adds IORING_SETUP_CLAMP. If set, and the values for SQ or CQ ring size exceed what we
     * support, then clamp them at the max values instead of returning -EINVAL. Since we return the
     * chosen ring sizes after setup, no further changes are needed on the application side.
     * io_uring already changes the ring sizes if the application doesn't ask for power-of-two
     * sizes, for example.
     *
     * Note: Available from Linux 5.6
     */
    CLAMP = 1U << 4,        /* clamp SQ/CQ ring sizes */

    /**
     * `IORING_SETUP_ATTACH_WQ`
     *
     * If IORING_SETUP_ATTACH_WQ is set, it expects wq_fd in io_uring_params to be a valid io_uring
     * fd, the io-wq of which will be shared with the newly created io_uring instance. If the flag
     * is set but it can't share the io-wq, it fails.
     *
     * This allows creation of "sibling" io_urings, where we prefer to keep the SQ/CQ private, but
     * want to share the async backend to minimize the amount of overhead associated with having
     * multiple rings that belong to the same backend.
     *
     * Note: Available from Linux 5.6
     */
    ATTACH_WQ = 1U << 5,    /* attach to existing wq */
}
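
// Illustrative sketch of the IORING_SETUP_SQPOLL wakeup guard described above, written
// against this module's types. `sqFlags` is assumed to point at the `flags` field of the
// mmap'ed SQ ring (see `SubmissionQueueRingOffsets.flags`); the helper name and memory
// ordering choice are part of this example, not mandated by the kernel API.
private int exampleSqPollWakeup(int ringFd, uint toSubmit, shared(uint)* sqFlags)
{
    import core.atomic : atomicLoad, MemoryOrder;

    // The kernel poll thread sets NEED_WAKEUP when it goes to sleep; only then is a
    // syscall required to get queued submissions processed.
    if (atomicLoad!(MemoryOrder.acq)(*sqFlags) & SubmissionQueueFlags.NEED_WAKEUP)
        return io_uring_enter(ringFd, toSubmit, 0, EnterFlags.SQ_WAKEUP);
    return 0; // poll thread is awake, no syscall needed
}
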
/// `io_uring_params->features` flags
enum SetupFeatures : uint
{
    NONE = 0,

    /**
     * `IORING_FEAT_SINGLE_MMAP` (from Linux 5.4)
     *
     * Indicates that we can use single mmap feature to map both sq and cq rings and so to avoid the
     * second mmap.
     */
    SINGLE_MMAP = 1U << 0,

    /**
     * `IORING_FEAT_NODROP` (from Linux 5.5)
     *
     * Currently we drop completion events, if the CQ ring is full. That's fine
     * for requests with bounded completion times, but it may make it harder or
     * impossible to use io_uring with networked IO where request completion
     * times are generally unbounded. Or with POLL, for example, which is also
     * unbounded.
     *
     * After this patch, we never overflow the ring, we simply store requests
     * in a backlog for later flushing. This flushing is done automatically by
     * the kernel. To prevent the backlog from growing indefinitely, if the
     * backlog is non-empty, we apply back pressure on IO submissions. Any
     * attempt to submit new IO with a non-empty backlog will get an -EBUSY
     * return from the kernel. This is a signal to the application that it has
     * backlogged CQ events, and that it must reap those before being allowed
     * to submit more IO.
     *
     * Note that if we do return -EBUSY, we will have filled whatever
     * backlogged events into the CQ ring first, if there's room. This means
     * the application can safely reap events WITHOUT entering the kernel and
     * waiting for them, they are already available in the CQ ring.
     */
    NODROP = 1U << 1,

    /**
     * `IORING_FEAT_SUBMIT_STABLE` (from Linux 5.5)
     *
     * If this flag is set, applications can be certain that any data for async offload has been
     * consumed when the kernel has consumed the SQE.
     */
    SUBMIT_STABLE = 1U << 2,

    /**
     * `IORING_FEAT_RW_CUR_POS` (from Linux 5.6)
     *
     * If this flag is set, applications can know if setting `-1` as file offsets (meaning to work
     * with current file position) is supported.
     */
    RW_CUR_POS = 1U << 3,

    /**
     * `IORING_FEAT_CUR_PERSONALITY` (from Linux 5.6)
     * We currently setup the io_wq with a static set of mm and creds. Even for a single-use io-wq
     * per io_uring, this is suboptimal as we may have multiple enters of the ring. For
     * sharing the io-wq backend, it doesn't work at all.
     *
     * Switch to passing in the creds and mm when the work item is setup. This means that async
     * work is no longer deferred to the io_uring mm and creds, it is done with the current mm and
     * creds.
     *
     * Flag this behavior with IORING_FEAT_CUR_PERSONALITY, so applications know they can rely on
     * the current personality (mm and creds) being the same for direct issue and async issue.
     */
    CUR_PERSONALITY = 1U << 4,

    /**
     * `IORING_FEAT_FAST_POLL` (from Linux 5.7)
     * Currently io_uring tries any request in a non-blocking manner, if it can, and then retries
     * from a worker thread if we get -EAGAIN. Now that we have a new and fancy poll based retry
     * backend, use that to retry requests if the file supports it.
     *
     * This means that, for example, an IORING_OP_RECVMSG on a socket no longer requires an async
     * thread to complete the IO. If we get -EAGAIN reading from the socket in a non-blocking
     * manner, we arm a poll handler for notification on when the socket becomes readable. When it
     * does, the pending read is executed directly by the task again, through the io_uring task
     * work handlers. Not only is this faster and more efficient, it also means we're not
     * generating potentially tons of async threads that just sit and block, waiting for the IO to
     * complete.
     *
     * The feature is marked with IORING_FEAT_FAST_POLL, meaning that async pollable IO is fast,
     * and that poll<link>other_op is fast as well.
     */
    FAST_POLL = 1U << 5,

    /**
     * `IORING_FEAT_POLL_32BITS` (from Linux 5.9)
     * Poll events should be 32-bits to cover EPOLLEXCLUSIVE.
     * Explicit word-swap the poll32_events for big endian to make sure the ABI is not changed. We
     * call this feature IORING_FEAT_POLL_32BITS, applications who want to use EPOLLEXCLUSIVE should
     * check the feature bit first.
     */
    POLL_32BITS = 1U << 6
}
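
// Illustrative sketch: after io_uring_setup() returns, the kernel reports its capabilities
// in SetupParameters.features and the bits above can be tested directly. The helper is
// invented for this example.
private bool exampleHasFeature(const ref SetupParameters params, SetupFeatures feature)
{
    return (params.features & feature) == feature; // all requested feature bits are present
}
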
/**
 * Filled with the offsets for mmap(2)
 *
 * C API: `struct io_sqring_offsets`
 */
struct SubmissionQueueRingOffsets
{
    /// Incremented by kernel after entry at `head` was processed.
    /// Pending submissions: [head..tail]
    uint head;

    /// Modified by user space when new entry was queued; points to next
    /// entry user space is going to fill.
    uint tail;

    /// value `value_at(self.ring_entries) - 1`
    /// mask for indices at `head` and `tail` (don't delete masked bits!
    /// `head` and `tail` can point to the same entry, but if they are
    /// not exactly equal it implies the ring is full, and if they are
    /// exactly equal the ring is empty.)
    uint ring_mask;

    /// value same as SetupParameters.sq_entries, power of 2.
    uint ring_entries;

    /// SubmissionQueueFlags
    SubmissionQueueFlags flags;

    /// number of (invalid) entries that were dropped; entries are
    /// invalid if their index (in `array`) is out of bounds.
    uint dropped;

    /// index into array of `SubmissionEntry`s at offset `SUBMISSION_QUEUE_ENTRIES_OFFSET` in mmap()
    uint array;

    private uint[3] resv;   // reserved
}

enum SubmissionQueueFlags: uint
{
    NONE = 0,

    /// `IORING_SQ_NEED_WAKEUP`: needs io_uring_enter wakeup
    /// set by kernel poll thread when it goes sleeping, and reset on wakeup
    NEED_WAKEUP = 1U << 0,

    /// `IORING_SQ_CQ_OVERFLOW`: CQ ring is overflown
    /// Since Kernel 5.8
    /// Applications that are not willing to use io_uring_enter() to reap and handle cqes may rely
    /// completely on liburing's io_uring_peek_cqe(); but if the cq ring has overflowed,
    /// io_uring_peek_cqe() is not aware of it and won't enter the kernel to flush cqes.
    /// To fix this issue, the cq overflow status is exported to userspace via the new
    /// IORING_SQ_CQ_OVERFLOW flag, so that helper functions in liburing, such as io_uring_peek_cqe,
    /// can be aware of the cq overflow and do the flush accordingly.
    CQ_OVERFLOW = 1U << 1
}

/**
 * Field offsets used to map the kernel structure to ours.
 *
 * C API: `struct io_cqring_offsets`
 */
struct CompletionQueueRingOffsets
{
    /// incremented by user space after entry at `head` was processed.
    /// available entries for processing: [head..tail]
    uint head;

    /// modified by kernel when new entry was created; points to next
    /// entry kernel is going to fill.
    uint tail;

    /// value `value_at(ring_entries) - 1`
    /// mask for indices at `head` and `tail` (don't delete masked bits!
    /// `head` and `tail` can point to the same entry, but if they are
    /// not exactly equal it implies the ring is full, and if they are
    /// exactly equal the ring is empty.)
    uint ring_mask;

    /// value same as SetupParameters.cq_entries, power of 2.
    uint ring_entries;

    /// incremented by the kernel every time it failed to queue a
    /// completion event because the ring was full.
    uint overflow;

    /// Offset to array of completion queue entries
    uint cqes;

    CQRingFlags flags;  /// (available from Linux 5.8)
    private uint _resv1;
    private ulong _resv2;
}
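
// Illustrative sketch: draining completions from the mmap'ed CQ ring using the offsets
// above. `cqRingPtr` is assumed to point at the region mapped at COMPLETION_QUEUE_RING_OFFSET;
// the helper and its memory-ordering choices are one possible scheme, not the only valid one.
private void exampleDrainCompletions(ubyte* cqRingPtr, const ref CompletionQueueRingOffsets off)
{
    import core.atomic : atomicLoad, atomicStore, MemoryOrder;

    auto khead = cast(shared(uint)*)(cqRingPtr + off.head);             // consumer index (ours)
    auto ktail = cast(shared(uint)*)(cqRingPtr + off.tail);             // producer index (kernel's)
    const mask = *cast(const(uint)*)(cqRingPtr + off.ring_mask);
    auto cqes  = cast(const(CompletionEntry)*)(cqRingPtr + off.cqes);

    uint head  = atomicLoad!(MemoryOrder.raw)(*khead);                  // only userspace advances head
    const tail = atomicLoad!(MemoryOrder.acq)(*ktail);                  // kernel publishes entries at tail
    while (head != tail)
    {
        const cqe = cqes[head & mask];
        // cqe.user_data identifies the originating SQE, cqe.res carries its result
        head++;
    }
    atomicStore!(MemoryOrder.rel)(*khead, head);                        // return consumed entries to the kernel
}
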
/// CompletionQueue ring flags
enum CQRingFlags : uint
{
    NONE = 0,   /// No flags set

    /// `IORING_CQ_EVENTFD_DISABLED` disable eventfd notifications (available from Linux 5.8)
    /// This flag can be set/cleared by the application to disable/enable eventfd notifications
    /// when a request is completed and queued to the CQ ring.
    ///
    /// Before this patch, notifications were always sent if an eventfd is registered, so
    /// IORING_CQ_EVENTFD_DISABLED is not set during the initialization. It is up to the application
    /// to set the flag after initialization if no notifications are required at the beginning.
    EVENTFD_DISABLED = 1U << 0,
}

/// io_uring_register(2) opcodes and arguments
enum RegisterOpCode : uint
{
    /**
     * `arg` points to a struct iovec array of nr_args entries. The buffers associated with the
     * iovecs will be locked in memory and charged against the user's RLIMIT_MEMLOCK resource limit.
     * See getrlimit(2) for more information. Additionally, there is a size limit of 1GiB per
     * buffer. Currently, the buffers must be anonymous, non-file-backed memory, such as that
     * returned by malloc(3) or mmap(2) with the MAP_ANONYMOUS flag set. It is expected that this
     * limitation will be lifted in the future. Huge pages are supported as well. Note that the
     * entire huge page will be pinned in the kernel, even if only a portion of it is used.
     *
     * After a successful call, the supplied buffers are mapped into the kernel and eligible for
     * I/O. To make use of them, the application must specify the IORING_OP_READ_FIXED or
     * IORING_OP_WRITE_FIXED opcodes in the submission queue entry (see the struct io_uring_sqe
     * definition in io_uring_enter(2)), and set the buf_index field to the desired buffer index.
     * The memory range described by the submission queue entry's addr and len fields must fall
     * within the indexed buffer.
     *
     * It is perfectly valid to setup a large buffer and then only use part of it for an I/O, as
     * long as the range is within the originally mapped region.
     *
     * An application can increase or decrease the size or number of registered buffers by first
     * unregistering the existing buffers, and then issuing a new call to io_uring_register() with
     * the new buffers.
     *
     * An application need not unregister buffers explicitly before shutting down the io_uring
     * instance.
     *
     * `IORING_REGISTER_BUFFERS`
     */
    REGISTER_BUFFERS = 0,

    /**
     * This operation takes no argument, and `arg` must be passed as NULL. All previously registered
     * buffers associated with the io_uring instance will be released.
     *
     * `IORING_UNREGISTER_BUFFERS`
     */
    UNREGISTER_BUFFERS = 1,

    /**
     * Register files for I/O. `arg` contains a pointer to an array of `nr_args` file descriptors
     * (signed 32 bit integers).
     *
     * To make use of the registered files, the IOSQE_FIXED_FILE flag must be set in the flags
     * member of the struct io_uring_sqe, and the fd member is set to the index of the file in the
     * file descriptor array.
     *
     * Files are automatically unregistered when the io_uring instance is torn down. An application
     * need only unregister if it wishes to register a new set of fds.
     *
     * `IORING_REGISTER_FILES`
     */
    REGISTER_FILES = 2,

    /**
     * This operation requires no argument, and `arg` must be passed as NULL. All previously
     * registered files associated with the io_uring instance will be unregistered.
     *
     * `IORING_UNREGISTER_FILES`
     */
    UNREGISTER_FILES = 3,

    /**
     * `IORING_REGISTER_EVENTFD`
     *
     * Registers an eventfd that will be used to notify about completions on the io_uring itself.
     *
     * Note: available from Linux 5.2
     */
    REGISTER_EVENTFD = 4,

    /**
     * `IORING_UNREGISTER_EVENTFD`
     *
     * Unregisters previously registered eventfd.
     *
     * Note: available from Linux 5.2
     */
    UNREGISTER_EVENTFD = 5,

    /// `IORING_REGISTER_FILES_UPDATE` (from Linux 5.5)
    REGISTER_FILES_UPDATE = 6,

    /**
     * `IORING_REGISTER_EVENTFD_ASYNC` (from Linux 5.6)
     *
     * If an application is using eventfd notifications with poll to know when new SQEs can be
     * issued, it's expecting the following read/writes to complete inline. And with that, it knows
     * that there are events available, and doesn't want spurious wakeups on the eventfd for those
     * requests.
     *
     * This adds IORING_REGISTER_EVENTFD_ASYNC, which works just like IORING_REGISTER_EVENTFD,
     * except it only triggers notifications for events that happen from async completions (IRQ, or
     * io-wq worker completions). Any completions inline from the submission itself will not
     * trigger notifications.
     */
    REGISTER_EVENTFD_ASYNC = 7,

    /**
     * `IORING_REGISTER_PROBE` (from Linux 5.6)
     *
     * The application currently has no way of knowing if a given opcode is supported or not
     * without having to try and issue one and see if we get -EINVAL or not. And even this approach
     * is fraught with peril, as maybe we're getting -EINVAL due to some fields being missing, or
     * maybe it's just not that easy to issue that particular command without doing some other leg
     * work in terms of setup first.
     *
     * This adds IORING_REGISTER_PROBE, which fills in a structure with info on what is supported
     * or not. This will work even with sparse opcode fields, which may happen in the future or
     * even today if someone backports specific features to older kernels.
     */
    REGISTER_PROBE = 8,

    /**
     * `IORING_REGISTER_PERSONALITY` (from Linux 5.6)
     *
     * If an application wants to use a ring with different kinds of credentials, it can register
     * them upfront. We don't lookup credentials, the credentials of the task calling
     * IORING_REGISTER_PERSONALITY are used.
     *
     * An 'id' is returned for the application to use in subsequent personality support.
     */
    REGISTER_PERSONALITY = 9,

    /// `IORING_UNREGISTER_PERSONALITY` (from Linux 5.6)
    UNREGISTER_PERSONALITY = 10,
}
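
// Illustrative sketch: registering a single fixed buffer as described for
// IORING_REGISTER_BUFFERS above (arg is an iovec array, nr_args its length). The helper is
// invented for this example; the buffer must be anonymous memory and stays pinned until it
// is unregistered or the ring is torn down.
private int exampleRegisterBuffer(int ringFd, void* buffer, size_t length)
{
    import core.sys.posix.sys.uio : iovec;

    iovec[1] iov;
    iov[0].iov_base = buffer;
    iov[0].iov_len  = length;
    return io_uring_register(ringFd, RegisterOpCode.REGISTER_BUFFERS, iov.ptr, cast(uint)iov.length);
}
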
/// io_uring_enter(2) flags
enum EnterFlags: uint
{
    NONE = 0,
    GETEVENTS = (1 << 0),   /// `IORING_ENTER_GETEVENTS`
    SQ_WAKEUP = (1 << 1),   /// `IORING_ENTER_SQ_WAKEUP`
}

/// Time specification as defined in kernel headers (used by TIMEOUT operations)
struct KernelTimespec
{
    long tv_sec;    /// seconds
    long tv_nsec;   /// nanoseconds
}

static assert(CompletionEntry.sizeof == 16);
static assert(CompletionQueueRingOffsets.sizeof == 40);
static assert(SetupParameters.sizeof == 120);
static assert(SubmissionEntry.sizeof == 64);
static assert(SubmissionQueueRingOffsets.sizeof == 40);

/// Indicating that OP is supported by the kernel
enum IO_URING_OP_SUPPORTED = 1U << 0;

struct io_uring_probe_op
{
    ubyte op;
    ubyte resv;
    ushort flags;   /* IO_URING_OP_* flags */
    uint resv2;
}

struct io_uring_probe
{
    ubyte last_op;  /* last opcode supported */
    ubyte ops_len;  /* length of ops[] array below */
    ushort resv;
    uint[3] resv2;
    io_uring_probe_op[0] ops;
}

/**
 * Setup a context for performing asynchronous I/O.
 *
 * The `io_uring_setup()` system call sets up a submission queue (SQ) and completion queue (CQ) with
 * at least `entries` entries, and returns a file descriptor which can be used to perform subsequent
 * operations on the io_uring instance. The submission and completion queues are shared between
 * userspace and the kernel, which eliminates the need to copy data when initiating and completing
 * I/O.
 *
 * See_Also: `io_uring_setup(2)`
 *
 * Params:
 *   entries = Defines how many entries the submission queue can hold.
 *   p = `SetupParameters`
 *
 * Returns:
 *     `io_uring_setup(2)` returns a new file descriptor on success. The application may then provide
 *     the file descriptor in a subsequent `mmap(2)` call to map the submission and completion queues,
 *     or to the `io_uring_register(2)` or `io_uring_enter(2)` system calls.
 *
 *     On error, -1 is returned and `errno` is set appropriately.
 */
int io_uring_setup(uint entries, scope ref SetupParameters p) @trusted
{
    pragma(inline);
    return syscall(SYS_io_uring_setup, entries, &p);
}
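
// Illustrative sketch: creating a ring and mapping its submission queue, using the
// constants and offsets defined above. The region sizes follow io_uring_setup(2); mapping
// the CQ ring and the SQE array works the same way with the other two offsets. The helper
// is invented for this example and omits error cleanup for brevity.
private int exampleCreateRing(ref SetupParameters params, uint entries)
{
    import core.sys.posix.sys.mman : mmap, MAP_FAILED, MAP_SHARED, PROT_READ, PROT_WRITE;

    params = SetupParameters.init;
    immutable fd = io_uring_setup(entries, params);
    if (fd < 0) return fd;

    // Size of the SQ ring region: the `array` index ring sits last in it.
    immutable sqLen = params.sq_off.array + params.sq_entries * uint.sizeof;
    auto sqPtr = mmap(null, sqLen, PROT_READ | PROT_WRITE, MAP_SHARED, fd,
        SetupParameters.SUBMISSION_QUEUE_RING_OFFSET);
    if (sqPtr == MAP_FAILED) return -1;

    // Likewise: CQ ring at COMPLETION_QUEUE_RING_OFFSET
    // (size cq_off.cqes + cq_entries * CompletionEntry.sizeof) and the SubmissionEntry
    // array at SUBMISSION_QUEUE_ENTRIES_OFFSET (size sq_entries * SubmissionEntry.sizeof).
    return fd;
}
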
/**
 * Initiate and/or complete asynchronous I/O
 *
 * `io_uring_enter()` is used to initiate and complete I/O using the shared submission and
 * completion queues setup by a call to `io_uring_setup(2)`. A single call can both submit new I/O
 * and wait for completions of I/O initiated by this call or previous calls to `io_uring_enter()`.
 *
 * When the system call returns that a certain amount of SQEs have been consumed and submitted, it's
 * safe to reuse SQE entries in the ring. This is true even if the actual IO submission had to be
 * punted to async context, which means that the SQE may in fact not have been submitted yet. If the
 * kernel requires later use of a particular SQE entry, it will have made a private copy of it.
 *
 * Note: For interrupt driven I/O (where `IORING_SETUP_IOPOLL` was not specified in the call to
 * `io_uring_setup(2)`), an application may check the completion queue for event completions without
 * entering the kernel at all.
 *
 * See_Also: `io_uring_enter(2)`
 *
 * Params:
 *   fd = the file descriptor returned by io_uring_setup(2).
 *   to_submit = specifies the number of I/Os to submit from the submission queue.
 *   min_complete = If the `IORING_ENTER_GETEVENTS` bit is set in flags, then the system call will attempt
 *       to wait for `min_complete` event completions before returning. If the io_uring instance was configured
 *       for polling, by specifying IORING_SETUP_IOPOLL in the call to io_uring_setup(2), then
 *       min_complete has a slightly different meaning. Passing a value of 0 instructs the kernel to
 *       return any events which are already complete, without blocking. If min_complete is a non-zero
 *       value, the kernel will still return immediately if any completion events are available. If
 *       no event completions are available, then the call will poll either until one or more
 *       completions become available, or until the process has exceeded its scheduler time slice.
 *   flags = Behavior modification flags - `EnterFlags`
 *   sig = a pointer to a signal mask (see `sigprocmask(2)`); if sig is not `null`, `io_uring_enter()`
 *       first replaces the current signal mask by the one pointed to by sig, then waits for events to
 *       become available in the completion queue, and then restores the original signal mask. The
 *       following `io_uring_enter()` call:
 *
 *       ```
 *       ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, &sig);
 *       ```
 *
 *       is equivalent to atomically executing the following calls:
 *
 *       ```
 *       pthread_sigmask(SIG_SETMASK, &sig, &orig);
 *       ret = io_uring_enter(fd, 0, 1, IORING_ENTER_GETEVENTS, NULL);
 *       pthread_sigmask(SIG_SETMASK, &orig, NULL);
 *       ```
 *
 *       See the description of `pselect(2)` for an explanation of why the sig parameter is necessary.
 *
 * Returns: On success, the number of I/Os successfully consumed is returned. On error, -1 is
 *     returned and `errno` is set appropriately.
 */
int io_uring_enter(int fd, uint to_submit, uint min_complete, EnterFlags flags, const sigset_t* sig = null)
{
    pragma(inline);
    return syscall(SYS_io_uring_enter, fd, to_submit, min_complete, flags, sig, sigset_t.sizeof);
}

/**
 * Register files or user buffers for asynchronous I/O.
 *
 * The `io_uring_register()` system call registers user buffers or files for use in an `io_uring(7)`
 * instance referenced by fd. Registering files or user buffers allows the kernel to take long term
 * references to internal data structures or create long term mappings of application memory,
 * greatly reducing per-I/O overhead.
 *
 * See_Also: `io_uring_register(2)`
 *
 * Params:
 *   fd = the file descriptor returned by a call to io_uring_setup(2)
 *   opcode = code of operation to execute on args
 *   arg = Args used by specified operation. See `RegisterOpCode` for usage details.
 *   nr_args = number of provided arguments
 *
 * Returns: On success, io_uring_register() returns 0. On error, -1 is returned, and errno is set accordingly.
 */
int io_uring_register(int fd, RegisterOpCode opcode, const(void)* arg, uint nr_args)
{
    pragma(inline);
    return syscall(SYS_io_uring_register, fd, opcode, arg, nr_args);
}

private:

// Syscalls
enum
{
    SYS_io_uring_setup      = 425,
    SYS_io_uring_enter      = 426,
    SYS_io_uring_register   = 427
}

extern (C):

/// Invoke `system call' number `sysno`, passing it the remaining arguments.
int syscall(int sysno, ...);