Linux select/poll源码剖析
Linux select/poll源码剖析
linux内核版本:2.6.34
在读select、poll源码前,需要先了解的知识点:
- 等待队列
- 文件系统(主要是进程的打开文件描述符表以及struct file)
- poll机制
- 资源注册监听poll() -> poll_wait(struct file *, wait_queue_head_t *, poll_table *pt) -> pt->qproc(struct file *, wait_queue_head_t *, poll_table *)
- 资源就绪通知callback_function(wait_queue_t *, unsigned mode, int sync, void *key)
- select/poll主要数据结构
- 一个select()/poll()调用对应一个struct poll_wqueues
- 一个监听事件对应一个struct poll_table_entry
Common
poll机制是所有多路转接的共性;调用控制块(struct poll_wqueues)与监听事件项(struct poll_table_entry)是select()与poll()的共性;事件描述集(fdset)与事件描述符(struct pollfd)则分别是select()与poll()各自的特性。
/*
 * Structures and helpers for sys_poll/sys_poll
 *
 * (These are the bookkeeping structures used by both do_select() and
 * do_sys_poll() in this file.)
 */
struct poll_wqueues {
    /* Embedded poll_table; __pollwait() recovers the poll_wqueues from it
     * with container_of(). */
    poll_table pt;
    /* Head of the list of dynamically allocated poll_table_page objects
     * (NULL until the inline entries below are exhausted). */
    struct poll_table_page *table;
    /*
     * poll_initwait() sets polling_task to `current`, i.e. the task that
     * issued the select()/poll() call.
     *
     * Each wait-queue node registered by __pollwait() stores a pointer to
     * this poll_wqueues in its `private` field.  When a resource becomes
     * ready, __pollwake() follows that pointer, reads polling_task and
     * wakes that task up.
     */
    struct task_struct *polling_task;
    /* Set to 1 by __pollwake() once any registered wait node has fired. */
    int triggered;
    /* Error recorded while registering (poll_get_entry() stores -ENOMEM). */
    int error;
    /* Number of inline_entries[] slots already handed out. */
    int inline_index;
    /* Inline cache of poll_table_entry objects, used before any page is
     * allocated. */
    struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

typedef struct poll_table_struct {
    /* Registration callback; poll_initwait() installs __pollwait here. */
    poll_queue_proc qproc;
    /* Event mask the caller is interested in. */
    unsigned long key;
} poll_table;

struct poll_table_page {
    /* Next (older) page in the list. */
    struct poll_table_page * next;
    /* Points at the next unused poll_table_entry inside entries[]. */
    struct poll_table_entry * entry;
    struct poll_table_entry entries[0];
};

/* The object that actually gets hooked onto a resource's wait queue. */
struct poll_table_entry {
    /* The monitored file (a reference is taken in __pollwait()). */
    struct file *filp;
    /* Event mask being waited for; checked by pollwake(). */
    unsigned long key;
    /*
     * Wait-queue node added to the resource's wait queue.  Its wake
     * function is pollwake(), and its `private` member points at the
     * poll_wqueues that owns this entry.
     */
    wait_queue_t wait;
    /* Head of the wait queue this entry was added to (needed to remove it
     * again in free_poll_entry()). */
    wait_queue_head_t *wait_address;
};
/*
 * poll_initwait - initialise a poll_wqueues for one select()/poll() call.
 *
 * Installs __pollwait as the registration callback, records the calling
 * task and resets the trigger/error/allocation state.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
    init_poll_funcptr(&pwq->pt, __pollwait);
    pwq->polling_task = current;
    pwq->triggered = 0;
    pwq->error = 0;
    pwq->table = NULL;
    pwq->inline_index = 0;
}

/* Set the registration callback of a poll_table and enable every event. */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
    pt->qproc = qproc;
    pt->key = ~0UL; /* all events enabled */
}

/* Add a new entry */
/**
 * __pollwait - hook a poll_table_entry onto a resource's wait queue
 * @filp: the file being monitored
 * @wait_address: head of the wait queue belonging to that file
 * @p: the poll_table set up by poll_initwait()
 *
 * Returns silently if no entry could be obtained; in that case
 * poll_get_entry() has already stored the error in the poll_wqueues.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p)
{
    /* Recover the enclosing poll_wqueues from the embedded poll_table. */
    struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
    /* Take one poll_table_entry from the poll_wqueues. */
    struct poll_table_entry *entry = poll_get_entry(pwq);
    if (!entry)
        return;
    /* Take a reference on the file; dropped by fput() in free_poll_entry(). */
    get_file(filp);
    /* Remember the file ... */
    entry->filp = filp;
    /* ... and the wait-queue head, so the node can be removed later. */
    entry->wait_address = wait_address;
    /* Record the events we want to be woken for. */
    entry->key = p->key;
    /*
     * Initialise the wait-queue node with pollwake as its wake function.
     * This is the key point: readiness notifications arrive via pollwake().
     */
    init_waitqueue_func_entry(&entry->wait, pollwake);
    /*
     * Why `private` points at the poll_wqueues rather than at the task:
     * 1. Actual design: every wait node's `private` points at the one
     *    shared poll_wqueues, which in turn holds a single pointer to the
     *    calling task -- n + 1 pointers in total.
     * 2. Hypothetical alternative: every wait node's `private` points at
     *    the task, and every poll_table_entry additionally stores a pointer
     *    to the shared poll_wqueues -- n + n pointers in total.
     */
    entry->wait.private = pwq;
    /* Finally add the node to the resource's wait queue. */
    add_wait_queue(wait_address, &entry->wait);
}

/*
 * poll_get_entry - hand out the next free poll_table_entry.
 *
 * Uses the inline array first, then page-sized poll_table_page objects.
 * Returns NULL and sets p->error to -ENOMEM if a page cannot be allocated.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
    struct poll_table_page *table = p->table;

    /* Serve from the inline cache while it still has room. */
    if (p->inline_index < N_INLINE_POLL_ENTRIES)
        return p->inline_entries + p->inline_index++;

    /* No page yet, or the current page is full: allocate a new one. */
    if (!table || POLL_TABLE_FULL(table)) {
        struct poll_table_page *new_table;

        /* One free page backs each poll_table_page. */
        new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
        if (!new_table) {
            p->error = -ENOMEM;
            return NULL;
        }
        /* First free slot is the start of entries[]; push the page onto
         * the front of the list. */
        new_table->entry = new_table->entries;
        new_table->next = table;
        p->table = new_table;
        table = new_table;
    }

    return table->entry++;
}

/*
 * poll_freewait - undo every registration made through this poll_wqueues
 * and free the pages that were allocated for entries.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
    struct poll_table_page * p = pwq->table;
    int i;
    /* Unhook the entries that live in the inline array. */
    for (i = 0; i < pwq->inline_index; i++)
        free_poll_entry(pwq->inline_entries + i);
    /* Unhook the entries that live in dynamically allocated pages. */
    while (p) {
        struct poll_table_entry * entry;
        struct poll_table_page *old;

        entry = p->entry;
        do {
            /* poll_get_entry() only allocates a page when it is about to
             * hand out an entry from it, so every page holds at least one
             * used entry and the initial entry-- is safe. */
            entry--;
            free_poll_entry(entry);
        } while (entry > p->entries);
        old = p;
        p = p->next;
        free_page((unsigned long) old);
    }
}

/* Remove one entry from its wait queue and drop the file reference. */
static void free_poll_entry(struct poll_table_entry *entry)
{
    remove_wait_queue(entry->wait_address, &entry->wait);
    fput(entry->filp);
}

/**
 * pollwake - wake callback installed on every registered wait node
 * @wait: the wait node, i.e. poll_table_entry.wait
 * @mode: passed through unchanged to __pollwake()
 * @sync: passed through unchanged to __pollwake()
 * @key: carries the resource's current event state (may be NULL)
 *
 * If @key is non-NULL and contains none of the events recorded in the
 * entry, the wake-up is ignored (returns 0); otherwise the call is
 * forwarded to __pollwake().
 */
static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    struct poll_table_entry *entry;

    /* Recover the poll_table_entry that embeds this wait node. */
    entry = container_of(wait, struct poll_table_entry, wait);
    /* None of the ready events is one we care about: ignore. */
    if (key && !((unsigned long)key & entry->key))
        return 0;
    /* At least one event of interest: do the real wake-up. */
    return __pollwake(wait, mode, sync, key);
}

/* The function that actually wakes the polling task when a resource is ready. */
static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
    /* `private` was set to the owning poll_wqueues in __pollwait(). */
    struct poll_wqueues *pwq = wait->private;
    /* Build a temporary wait node whose private field is the polling task. */
    DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

    /*
     * Although this function is called under waitqueue lock, LOCK
     * doesn't imply write barrier and the users expect write
     * barrier semantics on wakeup functions. The following
     * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
     * and is paired with set_mb() in poll_schedule_timeout.
     */
    smp_wmb();
    /* Mark the poll_wqueues as triggered. */
    pwq->triggered = 1;

    /*
     * Perform the default wake up operation using a dummy
     * waitqueue.
     *
     * TODO: This is hacky but there currently is no interface to
     * pass in @sync. @sync is scheduled to be removed and once
     * that happens, wake_up_process() can be used directly.
     */
    /* Wake the task that called select()/poll(). */
    return default_wake_function(&dummy_wait, mode, sync, key);
}
select
调用链:
sys_select() -> core_sys_select() -> do_select() -> 1. poll_initwait() -> 2. f_op->poll() [ -> poll_wait() -> __pollwait() -> poll_get_entry() ] -> 3. [ block -> goto 2 ] -> 4. poll_freewait() -> free_poll_entry();唤醒回调路径(资源就绪时由等待队列触发):pollwake() -> __pollwake()
typedef __kernel_fd_set fd_set;

/* Number of bits in one unsigned long. */
#define __NFDBITS (8 * sizeof(unsigned long))
#define __FD_SETSIZE 1024
/* Number of unsigned longs needed to hold __FD_SETSIZE bits. */
#define __FDSET_LONGS (__FD_SETSIZE/__NFDBITS)

/* The largest file descriptor an fd_set can represent is __FD_SETSIZE - 1. */
typedef struct {
    unsigned long fds_bits [__FDSET_LONGS];
} __kernel_fd_set;

/*
 * Scaleable version of the fd_set.
 *
 * in/out/ex are the three input bitmaps; res_in/res_out/res_ex are the
 * matching result bitmaps (core_sys_select() carves all six out of one
 * buffer).
 */
typedef struct {
    unsigned long *in, *out, *ex;
    unsigned long *res_in, *res_out, *res_ex;
} fd_set_bits;

/*
 * How many longwords for "nr" bits?
 */
/* Number of bits in one long. */
#define FDS_BITPERLONG (8*sizeof(long))
/* Number of longs needed to hold nr bits (rounded up). */
#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
/* Size in bytes of an array of longs large enough to hold nr bits. */
#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long))
/*
 * select() system call entry point: converts the user-supplied timeval
 * into a timespec, then delegates to core_sys_select().
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
        fd_set __user *, exp, struct timeval __user *, tvp)
{
    struct timespec end_time, *to = NULL;
    struct timeval tv;
    int ret;

    /*
     * Meaning of tvp:
     * 1. wait forever        tvp == NULL
     * 2. do not wait at all  tvp->tv_sec == 0 && tvp->tv_usec == 0
     * 3. wait a given time   tvp->tv_sec != 0 || tvp->tv_usec != 0
     */
    if (tvp) {
        /* Copy the timeout from user space into kernel space. */
        if (copy_from_user(&tv, tvp, sizeof(tv)))
            return -EFAULT;

        to = &end_time;
        /* Normalise usec overflow into seconds and convert usec -> nsec. */
        if (poll_select_set_timeout(to,
                tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
                (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC))
            return -EINVAL;
    }

    /*
     * After the timeout has been processed:
     * 1. wait forever        to == NULL
     * 2. do not wait at all  to->tv_sec == 0 && to->tv_nsec == 0
     * 3. wait a given time   *to presumably holds the absolute expiry time
     *                        (poll_select_set_timeout() is defined elsewhere)
     */

    /* Main line (so far only the timeout has been handled). */
    ret = core_sys_select(n, inp, outp, exp, to);
    /* Hand the remaining time back through tvp; done by
     * poll_select_copy_remaining(), which is defined elsewhere. */
    ret = poll_select_copy_remaining(&end_time, tvp, 1, ret);

    return ret;
}

/*
 * core_sys_select - copy the three fd sets in from user space, run
 * do_select(), and copy the three result sets back out.
 *
 * Returns do_select()'s result, or a negative errno (-EINVAL for n < 0,
 * -ENOMEM if the bitmap buffer cannot be allocated, -EFAULT if writing the
 * results back fails, -ERESTARTNOHAND if nothing was ready and a signal is
 * pending).
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
        fd_set __user *exp, struct timespec *end_time)
{
    fd_set_bits fds;
    void *bits;
    int ret, max_fds;
    unsigned int size;
    struct fdtable *fdt;
    /* Allocate small arguments on the stack to save memory and be faster */
    /*
     * The in/out/ex sets from user space have to be copied into kernel
     * space, and an equal amount of room is needed for the result sets
     * that are later written back.  A stack buffer is tried first; only if
     * it is too small is kmalloc() used.
     */
    long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

    ret = -EINVAL;
    if (n < 0)
        goto out_nofds;

    /* max_fds can increase, so grab it once to avoid race */
    /* RCU read-side section around the fdtable access. */
    rcu_read_lock();
    fdt = files_fdtable(current->files);
    max_fds = fdt->max_fds;
    rcu_read_unlock();
    /*
     * Clamp n to the capacity of the current file descriptor table.
     *
     * Author's note on the meaning of fdtable.max_fds -- in
     * expand_fdtable() -> alloc_fdtable() (not shown here):
     *     fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
     *     fdt->max_fds = nr;
     *     data = alloc_fdmem(nr * sizeof(struct file *));
     *     fdt->fd = (struct file **)data;
     */
    if (n > max_fds)
        n = max_fds;

    /*
     * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
     * since we used fdset we need to allocate memory in units of
     * long-words.
     */
    /*
     * size is the byte size of ONE bitmap holding n bits, rounded up to
     * whole longs (FDS_BYTES).  Six of them are needed, so the stack
     * buffer is used only if a sixth of it is at least size bytes;
     * otherwise 6 * size bytes are obtained with kmalloc().
     */
    size = FDS_BYTES(n);
    bits = stack_fds;
    if (size > sizeof(stack_fds) / 6) {
        /* Not enough space in on-stack array; must use kmalloc */
        ret = -ENOMEM;
        bits = kmalloc(6 * size, GFP_KERNEL);
        if (!bits)
            goto out_nofds;
    }
    /*
     * Split the buffer into six consecutive bitmaps:
     *   in, out, ex, res_in, res_out, res_ex
     */
    fds.in = bits;
    fds.out = bits + size;
    fds.ex = bits + 2*size;
    fds.res_in = bits + 3*size;
    fds.res_out = bits + 4*size;
    fds.res_ex = bits + 5*size;

    /* Copy the input sets from user space into kernel space. */
    if ((ret = get_fd_set(n, inp, fds.in)) ||
        (ret = get_fd_set(n, outp, fds.out)) ||
        (ret = get_fd_set(n, exp, fds.ex)))
        goto out;
    /* Clear the result sets. */
    zero_fd_set(n, fds.res_in);
    zero_fd_set(n, fds.res_out);
    zero_fd_set(n, fds.res_ex);

    /* Main line (so far only the descriptor sets have been handled). */
    ret = do_select(n, &fds, end_time);

    if (ret < 0)
        goto out;
    if (!ret) {
        /* Nothing ready: report -ERESTARTNOHAND if a signal is pending,
         * otherwise fall through with 0 and still write the (empty)
         * result sets back. */
        ret = -ERESTARTNOHAND;
        if (signal_pending(current))
            goto out;
        ret = 0;
    }

    /* Finally, write the result sets back to user space. */
    if (set_fd_set(n, inp, fds.res_in) ||
        set_fd_set(n, outp, fds.res_out) ||
        set_fd_set(n, exp, fds.res_ex))
        ret = -EFAULT;

out:
    /* Only free the buffer if it came from kmalloc(). */
    if (bits != stack_fds)
        kfree(bits);
out_nofds:
    return ret;
}

/**
 * do_select - core of select()
 * @n: first argument of select(), already clamped by core_sys_select()
 * @fds: kernel copies of the input sets plus the result sets to fill in
 * @end_time: timeout as prepared by the caller, or NULL to wait forever
 *
 * Returns the number of ready (fd, event-class) hits, 0 on timeout or
 * pending signal, or a negative errno.
 */
int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
{
    ktime_t expire, *to = NULL;
    struct poll_wqueues table;
    poll_table *wait;
    /* timed_out: 1 once the timeout has expired, 0 otherwise. */
    int retval, i, timed_out = 0;
    unsigned long slack = 0;

    /* Re-derive n from the input sets and the currently open descriptors
     * (also validates that every requested fd is open). */
    rcu_read_lock();
    retval = max_select_fd(n, fds);
    rcu_read_unlock();

    if (retval < 0)
        return retval;
    n = retval;

    /* ------------------------------------------------------------- */
    /* ---------------------------- main --------------------------- */
    /* ------------------------------------------------------------- */
    /*
     * Note: the poll_table is embedded in the poll_wqueues so that, when a
     * resource's poll() later calls back with the poll_table, the owning
     * poll_wqueues can be recovered from it.
     *
     * poll_initwait():
     * 1. initialises the embedded poll_table:
     *    - registration callback = __pollwait
     *    - events of interest = all (overwritten later by wait_key_set())
     * 2. sets polling_task to the current task
     *
     * Key point: the registration callback is __pollwait.
     */
    poll_initwait(&table);
    wait = &table.pt;
    /* Zero timeout: do not wait at all. */
    if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
        /* Note: wait is NULL from here on, so nothing gets registered. */
        wait = NULL;
        /* Treat the call as already timed out: a single scan is done. */
        timed_out = 1;
    }

    /* Compute the timer slack (estimate_accuracy() is defined elsewhere). */
    if (end_time && !timed_out)
        slack = estimate_accuracy(end_time);

    retval = 0;
    for (;;) {
        unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;

        inp = fds->in; outp = fds->out; exp = fds->ex;
        rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

        /* i walks the descriptors in [0, n), one long-sized word per pass. */
        for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
            unsigned long in, out, ex, all_bits, bit = 1, mask, j;
            unsigned long res_in = 0, res_out = 0, res_ex = 0;
            const struct file_operations *f_op = NULL;
            struct file *file = NULL;

            in = *inp++; out = *outp++; ex = *exp++;
            all_bits = in | out | ex;
            /* Skip a whole word in which no descriptor is requested. */
            if (all_bits == 0) {
                i += __NFDBITS;
                continue;
            }

            /* Walk every bit of this word. */
            for (j = 0; j < __NFDBITS; ++j, ++i, bit <<= 1) {
                int fput_needed;
                /* Past the range of interest [0, n): stop. */
                if (i >= n)
                    break;
                /* Skip descriptors nobody asked about. */
                if (!(bit & all_bits))
                    continue;
                /* Look up the struct file for descriptor i; a matching
                 * fput_light() below releases whatever fget_light() took. */
                file = fget_light(i, &fput_needed);
                /* No lock is held here, so the descriptor may have been
                 * closed concurrently -- hence the NULL check. */
                if (file) {
                    f_op = file->f_op;
                    /* Per the author's note DEFAULT_POLLMASK is
                     * POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM (its
                     * definition is not shown here). */
                    mask = DEFAULT_POLLMASK;
                    /*
                     * If the file implements poll(), call it; with a
                     * non-NULL wait this registers us on the file's wait
                     * queue (via __pollwait).  Files without poll() keep
                     * the default mask.
                     *
                     * wait is NULL in three situations:
                     * 1. the timeout was zero (do not wait at all)
                     * 2. a ready event has already been found
                     * 3. registration was already done on an earlier pass
                     * In those cases poll() is expected to merely report
                     * the file's current state.
                     */
                    if (f_op && f_op->poll) {
                        /* Record the events of interest in the poll_table. */
                        wait_key_set(wait, in, out, bit);
                        /* Register (if wait != NULL) and fetch current state. */
                        mask = (*f_op->poll)(file, wait);
                    }
                    fput_light(file, fput_needed);
                    /*
                     * For files without poll() the mask stays
                     * DEFAULT_POLLMASK, i.e. the file is reported ready.
                     * Per the author's note these are mainly regular files
                     * on block-device file systems (ext2/ext3/ext4), whose
                     * reads and writes do not block.
                     */
                    /* Match the state against the request; retval counts hits. */
                    if ((mask & POLLIN_SET) && (in & bit)) {
                        res_in |= bit;
                        retval++;
                        /* Something is ready, so we will return after this
                         * scan: stop registering on further files. */
                        wait = NULL;
                    }
                    if ((mask & POLLOUT_SET) && (out & bit)) {
                        res_out |= bit;
                        retval++;
                        wait = NULL;
                    }
                    if ((mask & POLLEX_SET) && (ex & bit)) {
                        res_ex |= bit;
                        retval++;
                        wait = NULL;
                    }
                }
            } // end of for (j = 0; j < __NFDBITS; ++j)
            /* Store results into the caller's fd_set_bits. */
            if (res_in)
                *rinp = res_in;
            if (res_out)
                *routp = res_out;
            if (res_ex)
                *rexp = res_ex;
            /* Give the scheduler a chance to run something else. */
            cond_resched();
        } // end of for (i = 0; i < n; )
        /* Registration is complete; never register again on later passes. */
        wait = NULL;
        /*
         * Leave the loop if any of these holds:
         * 1. something is ready          (retval)
         * 2. the timeout expired         (timed_out)
         * 3. a signal is pending         (signal_pending(current))
         */
        /*
         * NOTE(review) -- author's observation, not verified here:
         * suppose fds #1..#4 are monitored.  poll(#1) and poll(#2) register
         * and report nothing; poll(#3) registers and reports readable, so
         * wait becomes NULL; poll(#4) then only reports current state.  The
         * scan ends with retval != 0 and returns the state of #3 and #4,
         * but #1 or #2 may have become ready in the meantime and are not
         * reported by this call.
         */
        if (retval || timed_out || signal_pending(current))
            break;
        /* A registration error (e.g. -ENOMEM) also ends the loop. */
        if (table.error) {
            retval = table.error;
            break;
        }

        /*
         * If this is the first loop and we have a timeout
         * given, then we convert to ktime_t and set the to
         * pointer to the expiry value.
         */
        if (end_time && !to) {
            expire = timespec_to_ktime(*end_time);
            to = &expire;
        }

        /* Reaching this point means: nothing ready, no signal, no timeout. */
        /*
         * Sleep until woken or timed out, then rescan.  A return value of 0
         * from poll_schedule_timeout() is treated as "timed out".
         *
         * pollwake()/__pollwake() set table.triggered when a resource
         * fires.  Presumably poll_schedule_timeout() (defined elsewhere)
         * checks that flag so that a wake-up arriving between registration
         * and going to sleep is not lost -- per the author's note this is
         * the reason the triggered field exists.
         */
        if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
                       to, slack))
            timed_out = 1;
    } // end of for ( ; ; )

    /*
     * 1. Unhook every poll_table_entry from the resources' wait queues.
     * 2. Release the memory held by the poll_wqueues.
     */
    poll_freewait(&table);

    return retval;
}

/* Address of the n-th word of each input bitmap, and the OR of the three. */
#define FDS_IN(fds, n) (fds->in + n)
#define FDS_OUT(fds, n) (fds->out + n)
#define FDS_EX(fds, n) (fds->ex + n)
#define BITS(fds, n) (*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))

/*
 * max_select_fd - validate the input sets and find the scan limit.
 *
 * Returns (highest requested descriptor + 1), 0 if no descriptor is
 * requested, or -EBADF if any requested descriptor is not open.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
    unsigned long *open_fds;
    unsigned long set;
    int max;
    struct fdtable *fdt;

    /* handle last in-complete long-word first */
    /*
     * set = ~(~0UL << (n % __NFDBITS)), i.e. a mask of the low
     * (n % __NFDBITS) bits.  Examples:
     *   n = 16 -> set = ...00000000 00000000b
     *   n = 18 -> set = ...00000000 00000011b
     *   n = 23 -> set = ...00000000 01111111b
     */
    set = ~(~0UL << (n & (__NFDBITS-1)));
    n /= __NFDBITS;
    fdt = files_fdtable(current->files);
    open_fds = fdt->open_fds->fds_bits+n;
    max = 0;
    /* Handle the trailing partial word, if there is one. */
    if (set) {
        set &= BITS(fds, n);
        if (set) {
            /* Validity check: the input sets must not name a descriptor
             * that is not open. */
            if (!(set & ~*open_fds))
                goto get_max;
            return -EBADF;
        }
    }
    /* Walk the complete words from the highest index downwards. */
    while (n) {
        open_fds--;
        n--;
        set = BITS(fds, n);
        if (!set)
            continue;
        /* Validity check: the input sets must not name a descriptor that
         * is not open. */
        if (set & ~*open_fds)
            return -EBADF;
        /* Once max is known we `continue` rather than `break`, so that
         * every remaining word is still validated. */
        if (max)
            continue;
get_max:
        /* Count up to the highest set bit of this word, then add the
         * word's base: max = highest requested fd + 1. */
        do {
            max++;
            set >>= 1;
        } while (set);
        max += n * __NFDBITS;
    }

    return max;
}

/* Poll event bits that count as "readable", "writable" and "exceptional"
 * for select()'s three sets. */
#define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

/*
 * wait_key_set - set the events of interest in the poll_table for one bit.
 *
 * Does nothing when wait is NULL.  POLLEX_SET is always included;
 * POLLIN_SET / POLLOUT_SET are added when the bit is set in `in` / `out`.
 */
static inline void wait_key_set(poll_table *wait, unsigned long in,
                unsigned long out, unsigned long bit)
{
    if (wait) {
        wait->key = POLLEX_SET;
        if (in & bit)
            wait->key |= POLLIN_SET;
        if (out & bit)
            wait->key |= POLLOUT_SET;
    }
}
poll
调用链:
sys_poll() -> do_sys_poll() -> poll_initwait() -> do_poll() -> 1. do_pollfd() -> f_op->poll() [ -> poll_wait() -> __pollwait() -> poll_get_entry() ] -> 2. [ block -> goto 1 ] -> poll_freewait() -> free_poll_entry();唤醒回调路径(资源就绪时由等待队列触发):pollwake() -> __pollwake()
/* One monitored descriptor, as exchanged with user space by poll(). */
struct pollfd {
    /* File descriptor to monitor (do_pollfd() ignores negative values). */
    int fd;
    /* Events the caller is interested in. */
    short events;
    /* Events that are ready; filled in by do_pollfd(). */
    short revents;
};

/* One chunk of pollfd entries; do_sys_poll() chains chunks through `next`. */
struct poll_list {
    struct poll_list *next;
    /* Number of valid elements in entries[]. */
    int len;
    struct pollfd entries[0];
};
SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, long, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; if (timeout_msecs >= 0) { to = &end_time; poll_select_set_timeout(to, timeout_msecs / MSEC_PER_SEC, NSEC_PER_MSEC * (timeout_msecs % MSEC_PER_SEC)); } /* 主线(至此,我们只进行了超时时间处理) */ ret = do_sys_poll(ufds, nfds, to); /* ... */ if (ret == -EINTR) { struct restart_block *restart_block; restart_block = ¤t_thread_info()->restart_block; restart_block->fn = do_restart_poll; restart_block->poll.ufds = ufds; restart_block->poll.nfds = nfds; if (timeout_msecs >= 0) { restart_block->poll.tv_sec = end_time.tv_sec; restart_block->poll.tv_nsec = end_time.tv_nsec; restart_block->poll.has_timeout = 1; } else restart_block->poll.has_timeout = 0; ret = -ERESTART_RESTARTBLOCK; } return ret; } int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec *end_time) { struct poll_wqueues table; int err = -EFAULT, fdcount, len, size; /* Allocate small arguments on the stack to save memory and be faster - use long to make sure the buffer is aligned properly on 64 bit archs to avoid unaligned access */ /* poll_list缓存数组 */ long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *const head = (struct poll_list *)stack_pps; struct poll_list *walk = head; unsigned long todo = nfds; /* 输入验证 */ if (nfds > rlimit(RLIMIT_NOFILE)) return -EINVAL; /* 将用户空间中的pollfd拷贝到内核空间 */ len = min_t(unsigned int, nfds, N_STACK_PPS); for (;;) { walk->next = NULL; walk->len = len; if (!len) break; if (copy_from_user(walk->entries, ufds + nfds-todo, sizeof(struct pollfd) * walk->len)) goto out_fds; todo -= walk->len; if (!todo) break; len = min(todo, POLLFD_PER_PAGE); size = sizeof(struct poll_list) + sizeof(struct pollfd) * len; walk = walk->next = kmalloc(size, GFP_KERNEL); if (!walk) { err = -ENOMEM; goto out_fds; } } poll_initwait(&table); /* 主线(至此,我们只进行了描述符集处理) */ fdcount = do_poll(nfds, head, &table, end_time); poll_freewait(&table); /* 将结果写给用户空间 */ 
for (walk = head; walk; walk = walk->next) { struct pollfd *fds = walk->entries; int j; for (j = 0; j < walk->len; j++, ufds++) if (__put_user(fds[j].revents, &ufds->revents)) goto out_fds; } err = fdcount; out_fds: walk = head->next; while (walk) { struct poll_list *pos = walk; walk = walk->next; kfree(pos); } return err; } static int do_poll(unsigned int nfds, struct poll_list *list, struct poll_wqueues *wait, struct timespec *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; unsigned long slack = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { pt = NULL; timed_out = 1; } if (end_time && !timed_out) slack = estimate_accuracy(end_time); for (;;) { struct poll_list *walk; /* 遍历poll_list */ for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; /* 遍历pollfd */ pfd = walk->entries; pfd_end = pfd + walk->len; for (; pfd != pfd_end; pfd++) { /* * Fish for events. If we found one, record it * and kill the poll_table, so we don't * needlessly register any other waiters after * this. They'll get immediately deregistered * when we break out and return. */ /* * 对每一个想要监听的pollfd调用do_pollfd() * 返回非0表示监听的事件已发生... */ if (do_pollfd(pfd, pt)) { count++; pt = NULL; } } } /* * All waiters have already been registered, so don't provide * a poll_table to them on the next loop iteration. */ pt = NULL; if (!count) { count = wait->error; if (signal_pending(current)) count = -EINTR; } if (count || timed_out) break; /* * If this is the first loop and we have a timeout * given, then we convert to ktime_t and set the to * pointer to the expiry value. */ if (end_time && !to) { expire = timespec_to_ktime(*end_time); to = &expire; } if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } return count; } /* * Fish for pollable events on the pollfd->fd file descriptor. 
We're only * interested in events matching the pollfd->events mask, and the result * matching that mask is both recorded in pollfd->revents and returned. The * pwait poll_table will be used by the fd-provided poll handler for waiting, * if non-NULL. */ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) { unsigned int mask; int fd; mask = 0; fd = pollfd->fd; if (fd >= 0) { int fput_needed; struct file * file; file = fget_light(fd, &fput_needed); mask = POLLNVAL; if (file != NULL) { mask = DEFAULT_POLLMASK; if (file->f_op && file->f_op->poll) { if (pwait) /* poll()总是设置events中的POLLERR与POLLHUP */ pwait->key = pollfd->events | POLLERR | POLLHUP; mask = file->f_op->poll(file, pwait); } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; fput_light(file, fput_needed); } } pollfd->revents = mask; return mask; }
关于惊群:select()惊群与poll()惊群目前无法得到解决,因为它们无法像accept()那样使得每次只有一个调用者能挂载到资源的监听队列上...虽然epoll()已经较好地解决了惊群,但只限于同一个epoll实例的ET模式。
另一篇epoll源码剖析:https://www.nowcoder.com/discuss/79616
#Linux#