当前位置:网站首页>Thoroughly uncover how epoll realizes IO multiplexing
Thoroughly uncover how epoll realizes IO multiplexing
2022-07-26 16:59:00 【InfoQ】
int main(){
listen(lfd, ...);
cfd1 = accept(...);
cfd2 = accept(...);
efd = epoll_create(...);
epoll_ctl(efd, EPOLL_CTL_ADD, cfd1, ...);
epoll_ctl(efd, EPOLL_CTL_ADD, cfd2, ...);
epoll_wait(efd, ...)
}
- epoll_create: creates an epoll object.
- epoll_ctl: adds the connections to be managed to the epoll object.
- epoll_wait: waits for IO events on the connections it manages.
1. accept creates a new socket


//file: net/socket.c
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
int __user *, upeer_addrlen, int, flags)
{
struct socket *sock, *newsock;
// according to fd Find the listening socket
sock = sockfd_lookup_light(fd, &err, &fput_needed);
//1.1 Request and initialize a new socket
newsock = sock_alloc();
newsock->type = sock->type;
newsock->ops = sock->ops;
//1.2 Apply for a new file object , And set it to the new socket On
newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
......
//1.3 Receiving connection
err = sock->ops->accept(sock, newsock, sock->file->f_flags);
//1.4 Add a new file to the open file list of the current process
fd_install(newfd, newfile);
1.1 Initializing the struct socket object

//file: net/ipv4/af_inet.c
const struct proto_ops inet_stream_ops = {
...
.accept = inet_accept,
.listen = inet_listen,
.sendmsg = inet_sendmsg,
.recvmsg = inet_recvmsg,
...
}
1.2 Allocating a file for the new socket object

struct file *sock_alloc_file(struct socket *sock, int flags,
const char *dname)
{
struct file *file;
file = alloc_file(&path, FMODE_READ | FMODE_WRITE,
&socket_file_ops);
......
sock->file = file;
}//file: fs/file_table.c
struct file *alloc_file(struct path *path, fmode_t mode,
const struct file_operations *fop)
{
struct file *file;
file->f_op = fop;
......
}//file: net/socket.c
static const struct file_operations socket_file_ops = {
...
.aio_read = sock_aio_read,
.aio_write = sock_aio_write,
.poll = sock_poll,
.release = sock_close,
...
};
[Reader perks] The editor has also put together some C++ back-end development interview questions, teaching videos, and a back-end learning roadmap, all free of charge. Add the group below if you need them:
QQ group: 720209036 (click to join)
Shared via the group files.
The editor strongly recommends this free learning resource for C++ back-end development:
C/C++ Linux Server Development Senior Architect / C++ Back-end Development Architect

1.3 Receiving connection
//file: include/linux/net.h
struct socket {
struct file *file;
struct sock *sk;
}//file: net/socket.c
SYSCALL_DEFINE4(accept4, ...)
...
//1.3 Receiving connection
err = sock->ops->accept(sock, newsock, sock->file->f_flags);
}void sock_init_data(struct socket *sock, struct sock *sk)
{
sk->sk_wq = NULL;
sk->sk_data_ready = sock_def_readable;
}
1.4 Adding the new file to the current process's list of open files
//file: fs/file.c
/*
 * Install `file` under descriptor number `fd` in the file table of the
 * current task, by delegating to __fd_install() with current->files.
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;

	__fd_install(files, fd, file);
}
void __fd_install(struct files_struct *files, unsigned int fd,
struct file *file)
{
...
fdt = files_fdtable(files);
BUG_ON(fdt->fd[fd] != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
}
2. epoll_create implementation


// file:fs/eventpoll.c
SYSCALL_DEFINE1(epoll_create1, int, flags)
{
struct eventpoll *ep = NULL;
// Create a eventpoll object
error = ep_alloc(&ep);
}// file:fs/eventpoll.c
struct eventpoll {
//sys_epoll_wait Waiting queue used
wait_queue_head_t wq;
// The received descriptors will be put here
struct list_head rdllist;
// Every epoll There is a red black tree in the object
struct rb_root rbr;
......
}
- wq: the wait queue list. When the softirq has data ready, it uses wq to find the user processes blocked on the epoll object.
- rbr: a red-black tree. To support efficient lookup, insertion, and deletion across a massive number of connections, eventpoll uses a red-black tree internally. This tree manages all the socket connections that the user process has added.
- rdllist: the linked list of ready descriptors. When connections become ready, the kernel puts them on the rdllist list. The application process then only needs to check this list to find the ready connections, instead of traversing the whole tree.
//file: fs/eventpoll.c
static int ep_alloc(struct eventpoll **pep)
{
struct eventpoll *ep;
// apply epollevent Memory
ep = kzalloc(sizeof(*ep), GFP_KERNEL);
// Initialize the waiting queue header
init_waitqueue_head(&ep->wq);
// Initialize the ready list
INIT_LIST_HEAD(&ep->rdllist);
// Initialize the red black tree pointer
ep->rbr = RB_ROOT;
......
}
3. epoll_ctl: adding a socket
- 1. Allocate a red-black tree node object, epitem.
- 2. Add a wait entry to the socket's wait queue, with ep_poll_callback as its callback function.
- 3. Insert the epitem into the epoll object's red-black tree.

// file:fs/eventpoll.c
SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
struct epoll_event __user *, event)
{
struct eventpoll *ep;
struct file *file, *tfile;
// according to epfd find eventpoll Kernel object
file = fget(epfd);
ep = file->private_data;
// according to socket Handle number , To find the file Kernel object
tfile = fget(fd);
switch (op) {
case EPOLL_CTL_ADD:
if (!epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_insert(ep, &epds, tfile, fd);
} else
error = -EEXIST;
clear_tfile_check_list();
break;
}//file: fs/eventpoll.c
static int ep_insert(struct eventpoll *ep,
struct epoll_event *event,
struct file *tfile, int fd)
{
//3.1 Assign and initialize epitem
// Allocate one epi object
struct epitem *epi;
if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
return -ENOMEM;
// To assign epi To initialize
//epi->ffd The handle number and struct file Object address
INIT_LIST_HEAD(&epi->pwqlist);
epi->ep = ep;
ep_set_ffd(&epi->ffd, tfile, fd);
//3.2 Set up socket Waiting in line
// Define and initialize ep_pqueue object
struct ep_pqueue epq;
epq.epi = epi;
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
// call ep_ptable_queue_proc Register callback function
// The actual injected function is ep_poll_callback
revents = ep_item_poll(epi, &epq.pt);
......
//3.3 take epi Insert into eventpoll In the red black tree in the object
ep_rbtree_insert(ep, epi);
......
}
3.1 Allocating and initializing the epitem
//file: fs/eventpoll.c
/*
 * Per-descriptor item that ep_insert() allocates for each file added to
 * an epoll object. This listing is abridged: other excerpts in this
 * article also use epi->rdllink, epi->nwait and epi->event, which are
 * not shown here.
 */
struct epitem {
	/* Red-black tree node */
	struct rb_node rbn;
	/* Descriptor information for the socket (file pointer + fd number) */
	struct epoll_filefd ffd;
	/* The eventpoll object this item belongs to */
	struct eventpoll *ep;
	/* List of wait queue entries */
	struct list_head pwqlist;
};
static inline void ep_set_ffd(struct epoll_filefd *ffd,
struct file *file, int fd)
{
ffd->file = file;
ffd->fd = fd;
}
3.2 Setting up the socket wait queue

static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt)
{
pt->_key = epi->event.events;
return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events;
}/* No kernel lock held - perfect */
static unsigned int sock_poll(struct file *file, poll_table *wait)
{
...
return sock->ops->poll(file, sock, wait);
}//file: net/ipv4/tcp.c
unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
{
struct sock *sk = sock->sk;
sock_poll_wait(file, sk_sleep(sk), wait);
}//file: include/net/sock.h
static inline wait_queue_head_t *sk_sleep(struct sock *sk)
{
BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0);
return &rcu_dereference_raw(sk->sk_wq)->wait;
}static inline void sock_poll_wait(struct file *filp,
wait_queue_head_t *wait_address, poll_table *p)
{
poll_wait(filp, wait_address, p);
}
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && p->_qproc && wait_address)
p->_qproc(filp, wait_address, p);
}static int ep_insert(...)
{
...
init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
...
}
//file: include/linux/poll.h
static inline void init_poll_funcptr(poll_table *pt,
poll_queue_proc qproc)
{
pt->_qproc = qproc;
pt->_key = ~0UL; /* all events enabled */
}//file: fs/eventpoll.c
/*
 * Poll-table queueing callback (abridged). ep_insert() installs it with
 * init_poll_funcptr(), and poll_wait() invokes it through p->_qproc with
 * the wait queue head of the file being polled.
 *
 * file:  the file being polled
 * whead: the wait queue head to hook into (the socket's, not epoll's)
 * pt:    the poll table handed down from ep_insert()
 */
static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
				 poll_table *pt)
{
	/* Recover the epitem that owns this poll table */
	struct epitem *epi = ep_item_from_epqueue(pt);
	struct eppoll_entry *pwq;

	if (epi->nwait >= 0 && (pwq = kmem_cache_alloc(pwq_cache, GFP_KERNEL))) {
		/* Set ep_poll_callback as the callback of the new wait entry */
		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
		/* Put the entry on the socket's wait queue whead (note: not the epoll wait queue) */
		add_wait_queue(whead, &pwq->wait);
	}
}
//file:include/linux/wait.h
static inline void init_waitqueue_func_entry(
wait_queue_t *q, wait_queue_func_t func)
{
q->flags = 0;
q->private = NULL;
//ep_poll_callback Sign up to wait_queue_t On the object
// Call when data arrives q->func
q->func = func;
}
3.3 Inserting into the red-black tree

4. epoll_wait: waiting to receive

//file: fs/eventpoll.c
/*
 * epoll_wait system call entry (abridged excerpt): passes the user's
 * event buffer, maxevents and timeout on to ep_poll().
 */
SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
int, maxevents, int, timeout)
{
...
/* NOTE: ep and error are not declared in this excerpt; presumably they are set up in the elided part above */
error = ep_poll(ep, events, maxevents, timeout);
}
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
wait_queue_t wait;
......
fetch_events:
//4.1 Determine whether there are events on the ready queue
if (!ep_events_available(ep)) {
//4.2 Define a wait event and associate it with the current process
init_waitqueue_entry(&wait, current);
//4.3 Put new waitqueue Add to epoll->wq In the list
__add_wait_queue_exclusive(&ep->wq, &wait);
for (;;) {
...
//4.4 Give up CPU Take the initiative to go to sleep
if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
timed_out = 1;
...
}
4.1 Checking whether there are events on the ready queue
//file: fs/eventpoll.c
static inline int ep_events_available(struct eventpoll *ep)
{
return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
}
4.2 Defining a wait entry and associating it with the current process
//file: include/linux/wait.h
static inline void init_waitqueue_entry(wait_queue_t *q, struct task_struct *p)
{
q->flags = 0;
q->private = p;
q->func = default_wake_function;
}
4.3 Adding to the wait queue
static inline void __add_wait_queue_exclusive(wait_queue_head_t *q,
wait_queue_t *wait)
{
wait->flags |= WQ_FLAG_EXCLUSIVE;
__add_wait_queue(q, wait);
}
4.4 Yielding the CPU and going to sleep voluntarily
//file: kernel/hrtimer.c
/*
 * Thin wrapper: forwards expires, delta and mode unchanged to
 * schedule_hrtimeout_range_clock(), selecting CLOCK_MONOTONIC as the
 * clock, and returns that call's result.
 */
int __sched schedule_hrtimeout_range(ktime_t *expires,
		unsigned long delta, const enum hrtimer_mode mode)
{
	int ret;

	ret = schedule_hrtimeout_range_clock(expires, delta, mode,
					     CLOCK_MONOTONIC);
	return ret;
}
int __sched schedule_hrtimeout_range_clock(...)
{
schedule();
...
}//file: kernel/sched/core.c
static void __sched __schedule(void)
{
next = pick_next_task(rq);
...
context_switch(rq, prev, next);
}
5. Data arrives

- The ready handler set in socket->sock->sk_data_ready is sock_def_readable.
- In the socket's wait queue entry, the callback function is ep_poll_callback. Its private field is unused and holds a NULL pointer.
- In the eventpoll's wait queue entry, the callback function is default_wake_function. Its private field points to the user process waiting for the event.
5.1 Receive data to task queue
// file: net/ipv4/tcp_ipv4.c
int tcp_v4_rcv(struct sk_buff *skb)
{
......
th = tcp_hdr(skb); // obtain tcp header
iph = ip_hdr(skb); // obtain ip header
// According to the packet header Medium ip、 Port information to find the corresponding socket
sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
......
//socket Not locked by user
if (!sock_owned_by_user(sk)) {
{
if (!tcp_prequeue(sk, skb))
ret = tcp_v4_do_rcv(sk, skb);
}
}
}//file: net/ipv4/tcp_ipv4.c
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
if (sk->sk_state == TCP_ESTABLISHED) {
// Perform data processing in the connected state
if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
rsk = sk;
goto reset;
}
return 0;
}
// Others are not ESTABLISH Packet processing of state
......
}//file: net/ipv4/tcp_input.c
/*
 * Receive processing for a socket in the established state (abridged
 * excerpt): queues the incoming skb on the socket, then calls the
 * socket's sk_data_ready handler.
 */
int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
			const struct tcphdr *th, unsigned int len)
{
	......
	/* Put the received data on the socket's receive queue */
	eaten = tcp_queue_rcv(sk, skb, tcp_header_len,
			      &fragstolen);
	/* Data is ready: notify waiters on the socket through its sk_data_ready handler */
	sk->sk_data_ready(sk, 0);
	......
}
//file: net/ipv4/tcp_input.c
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int hdrlen,
bool *fragstolen)
{
// Put the received data in socket At the end of the receive queue
if (!eaten) {
__skb_queue_tail(&sk->sk_receive_queue, skb);
skb_set_owner_r(skb, sk);
}
return eaten;
}
5.2 Finding the ready callback function

//file: net/core/sock.c
static void sock_def_readable(struct sock *sk, int len)
{
struct socket_wq *wq;
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
// It's not a good name , It's not a blocked process ,
// Instead, judge that the waiting queue is not empty
if (wq_has_sleeper(wq))
// Execute the callback function on the waiting queue item
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
rcu_read_unlock();
}
- wq_has_sleeper: for a plain recvfrom system call, this really does check whether a process is blocked. For a socket under epoll, however, it only checks that the wait queue is not empty; there is not necessarily a blocked process.
- wake_up_interruptible_sync_poll: this merely invokes the callback function set on the socket's wait queue entry; it does not necessarily wake up a process.
//file: include/linux/wait.h
/*
 * Run the wake-up path on wait queue x, passing event mask m as the key:
 * expands to __wake_up_sync_key() with mode TASK_INTERRUPTIBLE and
 * nr_exclusive set to 1.
 */
#define wake_up_interruptible_sync_poll(x, m) \
__wake_up_sync_key((x), TASK_INTERRUPTIBLE, 1, (void *) (m))
//file: kernel/sched/core.c
void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, void *key)
{
...
__wake_up_common(q, mode, nr_exclusive, wake_flags, key);
}static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
5.3 Executing the socket's ready callback function
//file: fs/eventpoll.c
/*
 * Wait-queue callback that ep_ptable_queue_proc() registers on the
 * socket's wait queue (abridged excerpt). It moves the epitem onto the
 * owning eventpoll's ready list and wakes anything waiting on the
 * eventpoll's own wait queue.
 */
static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
	/* Get the epitem that corresponds to this wait entry */
	struct epitem *epi = ep_item_from_wait(wait);
	/* Get the eventpoll structure that owns the epitem */
	struct eventpoll *ep = epi->ep;

	/* 1. Add the current epitem to the eventpoll's ready list */
	list_add_tail(&epi->rdllink, &ep->rdllist);

	/* 2. Check whether anything is waiting on the eventpoll's wait queue */
	if (waitqueue_active(&ep->wq))
		wake_up_locked(&ep->wq);
	......
}
static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
wait_queue_t *curr, *next;
list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
unsigned flags = curr->flags;
if (curr->func(curr, mode, wake_flags, key) &&
(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
5.4 Executing the epoll ready notification

//file:kernel/sched/core.c
int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
void *key)
{
return try_to_wake_up(curr->private, mode, wake_flags);
}//file: fs/eventpoll.c
static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
int maxevents, long timeout)
{
......
__remove_wait_queue(&ep->wq, &wait);
set_current_state(TASK_RUNNING);
}
check_events:
// Return the ready event to the user process
ep_send_events(ep, events, maxevents))
}summary

- User process in kernel mode. When epoll_wait is called, the process traps into kernel mode to execute. This part of the code checks the receive queue and is responsible for blocking the current process and yielding the CPU.
- Hard and soft interrupt context. In this context, packets are received from the network card and processed, then placed on the socket's receive queue. For epoll, the kernel then finds the epitem associated with the socket and adds it to the epoll object's ready list. At that point it also checks whether any process is blocked on the epoll object and, if so, wakes it up.
- The illustration | Linux Network packet receiving process
- The illustration | In depth understanding of the stumbling block on the road of high-performance network development - Synchronous blocking network IO
Reference material

边栏推荐
- Operating system migration practice: deploying MySQL database on openeuler
- How to use C language nested linked list to realize student achievement management system
- Tcpdump命令详解
- Set up typera drawing bed
- PyQt5快速开发与实战 3.2 布局管理入门 and 3.3 Qt Designer实战应用
- Application of workflow engine in vivo marketing automation
- About the idea plug-in I wrote that can generate service and mapper with one click (with source code)
- [basic course of flight control development 1] crazy shell · open source formation UAV GPIO (LED flight information light and signal light control)
- mysql锁机制(举例说明)
- Docker install redis? How to configure persistence policy?
猜你喜欢

Tdengine landed in GCL energy technology, with tens of billions of data compressed to 600gb

2022-2023 topic recommendation of information management graduation project

Differences between the use of structs and classes
![[Development Tutorial 9] crazy shell arm function mobile phone-i2c tutorial](/img/9d/2a1deca934e6d56d729922b1d9e515.png)
[Development Tutorial 9] crazy shell arm function mobile phone-i2c tutorial

My SQL is OK. Why is it still so slow? MySQL locking rules

Acl-ijcai-sigir top conference paper report meeting (AIS 2022) Note 3: dialogue and generation
![[fluent -- advanced] packaging](/img/aa/bd6ecad52cbe4a34db75f067aa4dfe.png)
[fluent -- advanced] packaging

【开发教程9】疯壳·ARM功能手机-I2C教程

PXE efficient batch network installation

Win11怎么自动清理回收站?
随机推荐
如何保证缓存和数据库一致性
Why is digital transformation so difficult?!
mysql锁机制(举例说明)
How to ensure cache and database consistency
【开发教程8】疯壳·开源蓝牙心率防水运动手环-三轴计步伐
TCP 和 UDP 可以使用相同端口吗?
理财产品锁定期是什么意思?理财产品在锁定期能赎回吗?
TensorFlow Lite源码解析
NUC 11 build esxi 7.0.3f install network card driver-v2 (upgraded version in July 2022)
Pyqt5 rapid development and practice 3.4 signal and slot correlation
There are six ways to help you deal with the simpledateformat class, which is not a thread safety problem
Final consistency distributed transaction TCC
VS2017打开项目提示需要迁移的解决方法
Singleton mode
Can TCP and UDP use the same port?
Alibaba cloud Toolkit - project one click deployment tool
搭建typora图床
How does win11 automatically clean the recycle bin?
[basic course of flight control development 2] crazy shell · open source formation UAV - timer (LED flight information light and indicator light flash)
如何借助自动化工具落地DevOps|含低代码与DevOps应用实践