Control Flow of the read() Service: read(file-descriptor, buffer_address, buffersize); sys_read(unsigned int fd, char __user * buf, size_t count) fget_light(fd, &fput_needed); *files = current->files; if (atomic_read(&files->count) == 1) See if we are the only one using this open-file table fcheck_files(files, fd); Yup, use the open file array to get the filp record pointer *fdt = files_fdtable(files); Get pointer to open file (fd) array file = rcu_dereference(fdt->fd[fd]); Use the fd as an index to get the filp pointer <-return to fget_light() with ptr to filp record else rcu_read_lock(); Nope, other tasks share this open-file table fcheck_files(files, fd); Now use the open file array to get the filp record pointer if (atomic_inc_not_zero(&file->f_count)) Increment the reference count, but only if it is not already 0 rcu_read_unlock(); Let other processes access this file's reference count <-return to sys_read() with filp record pointer file_pos_read(file); Use the filp record to get the R/W file offset value <-return to sys_read() with file->f_pos; vfs_read(file, buf, count, &pos); Read from any type of file system if (!(file->f_mode & FMODE_READ)) Quit if VFS says it cannot be read if (!file->f_op || (!file->f_op->read && !file->f_op->aio_read)) Quit if back-end functions are not present access_ok(VERIFY_WRITE, buf, count); Quit if we cannot WRITE into user's buffer area rw_verify_area(READ, file, pos, count); Quit if parameter sizes are unreasonable if (unlikely((ssize_t) count < 0)) Quit if the requested size is negative if (unlikely((pos < 0) || (loff_t) (pos + count) < 0)) Quit if the offset is negative or offset + count overflows if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) Do we have to have exclusive access? (IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) setgid bit set, but no group execute bit (otherwise meaningless combination) locks_mandatory_area(read_write == READ ? 
FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); Restrict file access to just this process wait_event_interruptible(fl.fl_wait, !fl.fl_next); Sleep while waiting for exclusive access to file <-return to vfs_read() with (count > MAX_RW_COUNT ? MAX_RW_COUNT : count); Clamp the read request to at most MAX_RW_COUNT bytes. security_file_permission(file, MAY_READ); Do we have READ permission for this file? mode = file->f_dentry->d_inode->i_mode; Get permission bits from inode if (current->fsuid == inode->i_uid) mode >>= 6; We are the owner, so use the owner permission bits (otherwise "others" is assumed) else if (in_group_p(inode->i_gid)) mode >>= 3; We are in the group, so use the group permission bits (otherwise "others" is assumed) if ((mode & mask & MAY_READ) == mask) If mode and MAY_READ bits line up, we have permission. <-return to vfs_read() with 0; Yup, go ahead and read file if (file->f_op->read) Has the backend jump table been initialized with a special read function? file->f_op->read(file, buf, count, pos); Then let's use it to read the file Assume EXT2 file system; it uses generic_file_read() in /linux/mm/filemap.c generic_file_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) init_sync_kiocb(&kiocb, filp); Make asynchronous I/O appear to be synchronous I/O __generic_file_aio_read(&kiocb, &local_iov, 1, ppos); A single buffer is a special case of vector I/O generic_file_direct_IO(READ, iocb, iov, pos, nr_segs); Check the data cache... 
filemap_write_and_wait(mapping); filemap_fdatawrite(mapping); i_size_read(mapping->host); Get size of file <-return with inode->i_size wait_on_page_writeback_range(mapping, 0, (i_size - 1) >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); while ((pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) wait_on_page_writeback(page); prepare_to_wait(wq, &q->wait, mode); set_current_state(state); <-return to prepare_to_wait() pagevec_release(&pvec); cond_resched(); <-return to wait_on_page_writeback_range() with OK <-return to filemap_fdatawrite() <-return to filemap_write_and_wait() mapping->a_ops->direct_IO(rw, iocb, iov, offset, nr_segs); Assume EXT2 file system; it uses ext2_direct_IO() in /linux/fs/ext2/inode.c ext2_direct_IO(rw, iocb, iov, offset, nr_segs); blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, offset, nr_segs, ext2_get_block, NULL); __blockdev_direct_IO(rw, iocb, inode, bdev, iov, offset, nr_segs, get_block, end_io, DIO_LOCKING); filemap_write_and_wait_range(mapping, offset, end - 1); <-return to __blockdev_direct_IO() <-return to blockdev_direct_IO() <-return to ext2_direct_IO() <-return to generic_file_direct_IO() <-return to __generic_file_aio_read() do_generic_file_read(filp,ppos,&desc,file_read_actor); do_generic_mapping_read(filp->f_mapping, &filp->f_ra, filp, ppos, desc, actor); i_size_read(inode); <-return to do_generic_mapping_read() with size of file cond_resched(); If reading a lot of pages, we may have to suspend for a bit page_cache_readahead(mapping, &ra, filp, index, last_index - index); Request that future pages begin to be read now. find_get_page(mapping, index); Get the next page for file read handle_ra_miss(mapping, &ra, index); Oops, cache miss detected mapping_writably_mapped(mapping); If others are writing... 
flush_dcache_page(page); Then flush the CPU data cache for this page so that their writes are visible to the kernel mark_page_accessed(page); Otherwise mark this page accessed actor(desc, page, offset, nr); Get starting point in cache block mapping->a_ops->readpage(filp, page); file_read_actor(desc, page, offset, size); __copy_to_user(desc->arg.buf, kaddr + offset, size); <-return to __copy_to_user() with bytes read <-return to file_read_actor() with bytes read add_to_page_cache_lru(cached_page, mapping, index, GFP_KERNEL); <-return to add_to_page_cache_lru() <-return to do_generic_mapping_read() with bytes read <-return to do_generic_file_read() with bytes read <-return to generic_file_read() with bytes read file_accessed(filp); <-return to generic_file_read() with number of bytes wait_on_sync_kiocb(&kiocb); <-return to vfs_read() with number of bytes read else do_sync_read(file, buf, count, pos); Otherwise let's do it the traditional way... init_sync_kiocb(&kiocb, filp); Dress up asynchronous I/O to look like synchronous I/O filp->f_op->aio_read(&kiocb, buf, len, kiocb.ki_pos); Try to read the blocks... wait_on_retry_sync_kiocb(&kiocb); May have to try multiple times set_current_state(TASK_UNINTERRUPTIBLE); if (!kiocbIsKicked(iocb)) schedule(); Wait for I/O completion __set_current_state(TASK_RUNNING); <-return to do_sync_read() with RETRY or success wait_on_sync_kiocb(&kiocb); May be able to do it in one try... set_current_state(TASK_UNINTERRUPTIBLE); schedule(); Wait for I/O completion __set_current_state(TASK_RUNNING); <-return to do_sync_read() with ENQUEUED or success <-return to vfs_read() with number of bytes read fsnotify_access(file->f_dentry); current->rchar += ret; <-return to sys_read() with number of bytes read file_pos_write(file, pos); fput_light(file, fput_needed); <-return to read() with number of bytes read