selph
selph
发布于 2024-02-28 / 30 阅读
0
0

[libc 2.35 源码学习] IO_FILE 篇 - fread

简介&前言

这是IO_FILE篇的第二个函数分析:fread,本文分析了fread的流程,fread如何读取文件内容返回,fread中的各种缓冲区的使用:

// Read (get) area: _IO_read_base is its start, _IO_read_end its end, and
// _IO_read_ptr the current read position inside it.
// _IO_buf_base/_IO_buf_end delimit the underlying reserve buffer.
// NOTE(review): these declarations use wchar_t *, which looks like the
// wide-character variant of these fields; the code quoted later accesses the
// fields of the same name on FILE through char * (byte-oriented).  Confirm
// which struct this excerpt was copied from.
  wchar_t *_IO_read_ptr;	/* Current read pointer */
  wchar_t *_IO_read_end;	/* End of get area. */
  wchar_t *_IO_read_base;	/* Start of putback+get area. */

  wchar_t *_IO_buf_base;	/* Start of reserve area. */
  wchar_t *_IO_buf_end;		/* End of reserve area. */

主要流程概括一下大概就是:

  1. 没有缓冲区就申请缓冲区

  2. 缓冲区够用,就从缓冲区里读

  3. 缓冲区不够用,就先把缓冲区里的读了

    1. 然后如果剩余大小不小于整个缓冲区的大小(本例中缓冲区是0x1000字节,正好一个页面),就先把缓冲区大小整数倍的那部分通过系统调用直接读到用户的目标地址
    2. 此时缓冲区里的数据已经被读完(是空的),调用underflow函数去重新填充缓冲区
    3. 此时剩下要读取的大小小于缓冲区大小,就直接从缓冲区读取了

个人感觉看懂fread的核心是在于sgetn的缓冲区读取,以及underflow函数的缓冲区刷新

简单修改了下测试用代码:

#include<stdio.h>
#include<stdlib.h>
char buffer[100] = {0};
char strWrite[] = "\npadding content";

/*
 * Minimal driver for stepping through fread in a debugger.
 *
 * Opens ./test.txt in "w+" mode (which truncates it), tries to read up to
 * sizeof(buffer) bytes, prints whatever was read, appends strWrite and
 * finally dumps the file with `cat`.
 *
 * Returns 0 on success, 1 if the file cannot be opened.
 */
int main(){
	FILE* fp = fopen("./test.txt","w+");
	if (fp == NULL) {
		/* Without this check fread/fwrite/fclose would dereference NULL. */
		perror("fopen");
		return 1;
	}
	size_t bytesRead = fread(buffer,sizeof(char),sizeof(buffer),fp);
	/* buffer may be filled completely and then has no terminating NUL,
	   so print exactly the bytes that were read instead of using "%s". */
	printf("%.*s",(int)bytesRead,buffer);
	/* The file was just truncated, so the fread above ran into end-of-file;
	   in that case output may directly follow input on an update stream. */
	size_t toWrite = sizeof(strWrite) - 1;	/* skip the terminating NUL */
	if (fwrite(strWrite,sizeof(char),toWrite,fp) != toWrite)
		perror("fwrite");
	/* fclose flushes the buffered data; a failure means the write was lost. */
	if (fclose(fp) != 0)
		perror("fclose");
	system("cat ./test.txt");
	return 0;
}

源码分析

首先进入了_IO_fread函数

_IO_fread

/* Read up to COUNT items of SIZE bytes each from FP into BUF.
   Returns COUNT when every requested byte was read, otherwise the number of
   complete items read (bytes_read / size).  Returns 0 immediately when the
   requested byte total is 0.  */
size_t
_IO_fread(void *buf, size_t size, size_t count, FILE *fp)
{
    // Total number of bytes the caller asked for
    size_t bytes_requested = size * count;
    size_t bytes_read;
    // Sanity-check fp (a no-op unless IO_DEBUG is defined); returns 0 on failure
    CHECK_FILE(fp, 0);
    // Nothing to read: return 0 without taking the lock
    if (bytes_requested == 0)
        return 0;
    _IO_acquire_lock(fp);
    // Delegate the actual byte transfer to the next layer
    bytes_read = _IO_sgetn(fp, (char *)buf, bytes_requested);
    _IO_release_lock(fp);
    // Convert the byte count back into an item count
    return bytes_requested == bytes_read ? count : bytes_read / size;
}
libc_hidden_def(_IO_fread)

CHECK_FILE:定义了IO_DEBUG时,检查FILE指针是否为NULL以及_flags中的魔数是否正确,不通过就把errno设为EINVAL并返回RET;未定义IO_DEBUG时则啥也不做:

/* CHECK_FILE (FILE, RET): with IO_DEBUG defined, make the enclosing function
   set errno to EINVAL and return RET when FILE is NULL or the magic bits in
   its _flags do not equal _IO_MAGIC.  Without IO_DEBUG it expands to an
   empty statement.  */
#ifdef IO_DEBUG
# define CHECK_FILE(FILE, RET) do {				\
    if ((FILE) == NULL						\
	|| ((FILE)->_flags & _IO_MAGIC_MASK) != _IO_MAGIC)	\
      {								\
	__set_errno (EINVAL);					\
	return RET;						\
      }								\
  } while (0)
#else
# define CHECK_FILE(FILE, RET) do { } while (0)
#endif

调用下一层的_IO_sgetn函数

_IO_sgetn

/* Read up to N bytes from FP into DATA by dispatching to the stream's
   __xsgetn jump-table entry (via _IO_XSGETN) and return its result.  */
size_t
_IO_sgetn(FILE *fp, void *data, size_t n)
{
	/* FIXME handle putback buffer here! */

	return _IO_XSGETN(fp, data, n);
}
libc_hidden_def(_IO_sgetn)

调用_IO_XSGETN:

/* _IO_XSGETN: call the __xsgetn entry of FP's jump table with (FP, DATA, N). */
#define _IO_XSGETN(FP, DATA, N) JUMP2 (__xsgetn, FP, DATA, N)
/* JUMP2: look up FUNC in the jump table obtained from THIS through
   _IO_JUMPS_FUNC and call it with THIS plus two extra arguments.  */
#define JUMP2(FUNC, THIS, X1, X2) (_IO_JUMPS_FUNC(THIS)->FUNC) (THIS, X1, X2)

调用fp里的一个虚表函数,参数是2个

#define _IO_XSGETN(FP,DATA,N) JUMP2 (__xsgetn, FP, DATA, N)
Expands to:

((IO_validate_vtable ((*(__typeof__ (((struct _IO_FILE_plus){}).vtable) *)(((char *) ((fp))) + __builtin_offsetof (struct _IO_FILE_plus, vtable)))))->__xsgetn) (fp, data, n)

虚表函数是:__xsgetn

IO_validate_vtable:校验 vtable

动调步进到了这里:调用虚表函数会进行vtable的校验(2.24加入)

/* Check if unknown vtable pointers are permitted; otherwise,
   terminate the process.  */
void _IO_vtable_check (void) attribute_hidden;

/* Perform vtable pointer validation.  If validation fails, terminate
   the process.  Returns VTABLE unchanged so the call can be wrapped
   around a jump-table lookup.  */
static inline const struct _IO_jump_t *
IO_validate_vtable (const struct _IO_jump_t *vtable)
{
  /* Fast path: The vtable pointer is within the __libc_IO_vtables
     section.  */
  /* Size of the section, computed from its start/stop symbols.  */
  uintptr_t section_length = __stop___libc_IO_vtables - __start___libc_IO_vtables;
  uintptr_t ptr = (uintptr_t) vtable;
  /* Offset of the pointer from the section start.  The arithmetic is
     unsigned, so a pointer below the section start wraps to a huge value
     and the single comparison below rejects both "before" and "after".  */
  uintptr_t offset = ptr - (uintptr_t) __start___libc_IO_vtables;
  if (__glibc_unlikely (offset >= section_length))
    /* The vtable pointer is not in the expected section.  Use the
       slow path, which will terminate the process if necessary.  */
    _IO_vtable_check ();
  return vtable;
}

这里是校验vtable的地方:

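  1. 计算__libc_IO_vtables这个节(存放所有合法vtable的区域)的长度:__stop___libc_IO_vtables - __start___libc_IO_vtables
  2. 获取当前vtable指针的地址值
  3. 计算该地址相对于__libc_IO_vtables节起始地址的偏移(无符号运算,地址在节之前时会回绕成一个很大的值)
  4. 如果偏移不小于节的长度(说明vtable不在这个节里),就调用_IO_vtable_check ();进一步进行校验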

接下来才return到真正需要调用的虚表函数中

_IO_file_xsgetn

从文件读取字节到给定缓冲区

/* Read up to n bytes from fp into data.  Small remainders are served from
   the stream buffer (refilled through __underflow); remainders at least as
   large as the buffer are read straight into the caller's memory with
   _IO_SYSREAD.  Returns the number of bytes stored, i.e. n minus whatever is
   still wanted when the loop ends (EOF or error).  */
size_t
_IO_file_xsgetn(FILE *fp, void *data, size_t n)
// fp: the stream to read from
// data: destination memory supplied by the caller
// n: number of bytes requested
{
  
    size_t want, have;
    ssize_t count;
    char *s = data;

    want = n;
    // If the stream has no buffer yet, set one up
    if (fp->_IO_buf_base == NULL)
    {
        /* Maybe we already have a push back pointer.  */
        if (fp->_IO_save_base != NULL)
        {
            free(fp->_IO_save_base);
            fp->_flags &= ~_IO_IN_BACKUP;
        }
        // Allocate the stream buffer
        _IO_doallocbuf(fp);
    }

    // Main loop: runs until every requested byte has been delivered
    while (want > 0)
    {
        // Number of unread bytes currently sitting in the get area
        have = fp->_IO_read_end - fp->_IO_read_ptr;
        // The get area can satisfy the whole remaining request
        if (want <= have)
        {
            // Copy the bytes to the caller
            memcpy(s, fp->_IO_read_ptr, want);
            // Consume them from the get area
            fp->_IO_read_ptr += want;
            // Done: this terminates the loop
            want = 0;
        }
        else    // Not enough buffered bytes
        {
            // Some bytes are buffered: hand those over first
            if (have > 0)
            {
                // __mempcpy's result is stored back into s, the write cursor
                s = __mempcpy(s, fp->_IO_read_ptr, have);
                // Fewer bytes are wanted now
                want -= have;
                // Consume them from the get area
                fp->_IO_read_ptr += have;
            }

            /* Check for backup and repeat */
            // If the backup area was active, switch back to the main get
            // area and retry the copy from there
            if (_IO_in_backup(fp))
            {
                _IO_switch_to_main_get_area(fp);
                continue;
            }

            /* If we now want less than a buffer, underflow and repeat
               the copy.  Otherwise, _IO_SYSREAD directly to
               the user buffer. */
            // The remainder is smaller than the stream buffer
            if (fp->_IO_buf_base && want < (size_t)(fp->_IO_buf_end - fp->_IO_buf_base))
            {
                // Refill the buffer; give up on EOF
                if (__underflow(fp) == EOF)
                    break;
                // Otherwise loop again so the copy above picks up the new data
                continue;
            }

            // Reaching here: no buffer, or the remainder is at least one
            // buffer in size

            /* These must be set before the sysread as we might longjmp out
               waiting for input. */
            // Read directly into the caller's memory.
            // First make the get area empty (base == ptr == end == buf_base)
            _IO_setg(fp, fp->_IO_buf_base, fp->_IO_buf_base, fp->_IO_buf_base);
            // ... and the put area empty as well
            _IO_setp(fp, fp->_IO_buf_base, fp->_IO_buf_base);
            // Both areas now point at the start of the same buffer

            /* Try to maintain alignment: read a whole number of blocks.  */
            // Work out how many bytes to request from the system
            count = want;
            if (fp->_IO_buf_base)
            {
                // Size of the stream buffer
                size_t block_size = fp->_IO_buf_end - fp->_IO_buf_base;
                // Round down to a multiple of the buffer size (only when
                // the buffer is at least 128 bytes)
                if (block_size >= 128)
                    count -= want % block_size;
            }
            // Read that many bytes straight into s
            // #define _IO_SYSREAD(FP, DATA, LEN) JUMP2 (__read, FP, DATA, LEN)
            count = _IO_SYSREAD(fp, s, count);
            // Nothing read: 0 is recorded as EOF, a negative value as an error
            if (count <= 0)
            {
                if (count == 0)
                    fp->_flags |= _IO_EOF_SEEN;
                else
                    fp->_flags |= _IO_ERR_SEEN;

                break;
            }
            // Advance the write cursor and shrink the remainder
            s += count;
            want -= count;
            // Keep the cached file offset in step, if it is valid
            if (fp->_offset != _IO_pos_BAD)
                _IO_pos_adjust(fp->_offset, count);
        }
    }
    // Number of bytes actually delivered
    return n - want;
}
libc_hidden_def(_IO_file_xsgetn)

首先是一个判断,是否存在缓冲区_IO_buf_base,如果不存在就_IO_file_doallocate(见下文)申请一个

然后是一个循环进行读取操作:

  • 计算读取缓冲区的可用空间

  • 如果可用空间足够,就直接从读取指针_IO_read_ptr复制数据到目标地址,然后更新指针位置

  • 否则

    • 检查当前读取缓冲区可用空间是否为空,如果不为空,就先读取当前可用字节出来
    • 处理备份数据
    • 然后检查**_IO_buf_base**缓冲区是否满足要求(存在,且buf总空间大于请求大小),如果满足则调用**__underflow**填充缓冲区:如果返回EOF(没有读到数据)就跳出循环,否则继续循环,下一轮再从刚填充好的缓冲区里复制数据
    • 如果请求大小很大,超过了buf缓冲区的总大小,就从系统调用访问文件去读取数据,读整页的数据,最后调整offset指针位置,继续循环

__underflow

/* Make at least one byte available in fp's get area.
   Returns the next unread byte (as unsigned char, without consuming it), or
   EOF when no data can be made available.  */
int __underflow(FILE *fp)
{
	// Bail out with EOF when _IO_fwide(fp, -1) does not report -1.
	// NOTE(review): presumably -1 means "byte oriented", so this rejects
	// streams that are already wide oriented -- confirm.
	if (_IO_vtable_offset(fp) == 0 && _IO_fwide(fp, -1) != -1)
		return EOF;

	// Orientation not decided yet (_mode == 0): request mode -1
	// (presumably byte orientation -- confirm)
	if (fp->_mode == 0)
		_IO_fwide(fp, -1);

	// A stream currently in put mode must be switched to get mode first
	if (_IO_in_put_mode(fp))
		if (_IO_switch_to_get_mode(fp) == EOF)
			return EOF;
	// Unread bytes remain in the get area: return the next one
	// (the read pointer is not advanced)
	if (fp->_IO_read_ptr < fp->_IO_read_end)
		return *(unsigned char *)fp->_IO_read_ptr;

	// The backup area was active: switch back to the main get area and
	// check again for unread bytes there
	if (_IO_in_backup(fp))
	{
		_IO_switch_to_main_get_area(fp);
		if (fp->_IO_read_ptr < fp->_IO_read_end)
			return *(unsigned char *)fp->_IO_read_ptr;
	}
	// With markers present, save the current data for backup (failure
	// yields EOF); otherwise release any existing backup area
	if (_IO_have_markers(fp))
	{
		if (save_for_backup(fp, fp->_IO_read_end))
			return EOF;
	}
	else if (_IO_have_backup(fp))
		_IO_free_backup_area(fp);
	// Nothing buffered: dispatch to the stream's underflow handler
	return _IO_UNDERFLOW(fp);
}
libc_hidden_def(__underflow)

重点就两个地方:

  1. 如果read缓冲区里还有未读的数据,就直接返回_IO_read_ptr指向的那个字符(不移动指针)
  2. 没有空间走到最后,调用虚函数_IO_UNDERFLOW

此时的fp:

pwndbg> p* fp
$17 = {
  _flags = -72539008,
  _IO_read_ptr = 0x0,
  _IO_read_end = 0x0,
  _IO_read_base = 0x0,
  _IO_write_base = 0x0,
  _IO_write_ptr = 0x0,
  _IO_write_end = 0x0,
  _IO_buf_base = 0x602480 "",
  _IO_buf_end = 0x603480 "",
  _IO_save_base = 0x0,
  _IO_backup_base = 0x0,
  _IO_save_end = 0x0,
  _markers = 0x0,
  _chain = 0x7ffff7faf6a0 <_IO_2_1_stderr_>,
  _fileno = 3,
  _flags2 = 0,
  _old_offset = 0,
  _cur_column = 0,
  _vtable_offset = 0 '\000',
  _shortbuf = "",
  _lock = 0x602380,
  _offset = -1,
  _codecvt = 0x0,
  _wide_data = 0x602390,
  _freeres_list = 0x0,
  _freeres_buf = 0x0,
  __pad5 = 0,
  _mode = -1,
  _unused2 = '\000' <repeats 19 times>
}

fp的read/write相关指针都是0(只有_IO_buf_base和_IO_buf_end已经分配好了,大小0x1000),read缓冲区是空的,所以会走到最后的虚函数的调用:_IO_new_file_underflow

_IO_new_file_underflow

/* Refill fp's buffer from the underlying file.
   Returns the first unread byte (as unsigned char, without consuming it), or
   EOF when the stream is at end of file, is not readable, or the read
   failed.  */
int _IO_new_file_underflow(FILE *fp)
{
    ssize_t count;

    /* C99 requires EOF to be "sticky".  */
    // Once EOF has been seen, keep reporting it
    if (fp->_flags & _IO_EOF_SEEN)
        return EOF;

    // Reading is not allowed on this stream: flag an error, errno = EBADF
    if (fp->_flags & _IO_NO_READS)
    {
        fp->_flags |= _IO_ERR_SEEN;
        __set_errno(EBADF);
        return EOF;
    }
    // Unread bytes remain in the get area: return the next one
    if (fp->_IO_read_ptr < fp->_IO_read_end)
        return *(unsigned char *)fp->_IO_read_ptr;
  
    // No buffer yet: set one up
    if (fp->_IO_buf_base == NULL)
    {
        /* Maybe we already have a push back pointer.  */
        if (fp->_IO_save_base != NULL)
        {
            free(fp->_IO_save_base);
            fp->_flags &= ~_IO_IN_BACKUP;
        }
        _IO_doallocbuf(fp);
    }
    // For a line-buffered or unbuffered input stream, flush stdout first
    /* FIXME This can/should be moved to genops ?? */
    if (fp->_flags & (_IO_LINE_BUF | _IO_UNBUFFERED))
    {
        /* We used to flush all line-buffered stream.  This really isn't
       required by any standard.  My recollection is that
       traditional Unix systems did this for stdout.  stderr better
       not be line buffered.  So we do just that here
       explicitly.  --drepper */
        _IO_acquire_lock(stdout);

        // Only when stdout is linked, line buffered and writable
        if ((stdout->_flags & (_IO_LINKED | _IO_NO_WRITES | _IO_LINE_BUF)) == (_IO_LINKED | _IO_LINE_BUF))
            _IO_OVERFLOW(stdout, EOF);

        _IO_release_lock(stdout);
    }

    // Put the stream into get (read) mode
    _IO_switch_to_get_mode(fp);

    /* This is very tricky. We have to adjust those
       pointers before we call _IO_SYSREAD () since
       we may longjump () out while waiting for
       input. Those pointers may be screwed up. H.J. */
    // Make both the get area and the put area empty, anchored at buf_base
    fp->_IO_read_base = fp->_IO_read_ptr = fp->_IO_buf_base;
    fp->_IO_read_end = fp->_IO_buf_base;
    fp->_IO_write_base = fp->_IO_write_ptr = fp->_IO_write_end = fp->_IO_buf_base;
    // Ask for as many bytes as the whole buffer can hold
    count = _IO_SYSREAD(fp, fp->_IO_buf_base,
                        fp->_IO_buf_end - fp->_IO_buf_base);
    // Nothing read: 0 is recorded as EOF; a negative value is recorded as
    // an error and count is reset to 0 so the code below treats it alike
    if (count <= 0)
    {
        if (count == 0)
            fp->_flags |= _IO_EOF_SEEN;
        else
            fp->_flags |= _IO_ERR_SEEN, count = 0;
    }
    // Extend the get area by the number of bytes just read
    fp->_IO_read_end += count;
    if (count == 0)
    {
        /* If a stream is read to EOF, the calling application may switch active
       handles.  As a result, our offset cache would no longer be valid, so
       unset it.  */
        fp->_offset = _IO_pos_BAD;
        return EOF;
    }
    // Keep the cached file offset in step, if it is valid
    if (fp->_offset != _IO_pos_BAD)
        _IO_pos_adjust(fp->_offset, count);
    // Return the first byte of the freshly filled get area
    return *(unsigned char *)fp->_IO_read_ptr;
}
libc_hidden_ver(_IO_new_file_underflow, _IO_file_underflow)

这个函数的主要目标:读取文件内容填充缓冲区

主要流程如下:

  1. 设置read和write缓冲区指向buf缓冲区
  2. 使用系统调用read读取文件内容到buf缓冲区,读满
  3. 把read缓冲区的_IO_read_end向后移动实际读取到的字节数
  4. 返回_IO_read_ptr指向的第一个字符(读到0字节或出错时返回EOF)

涉及到的其他虚表函数及相关宏定义

_IO_file_doallocate:申请空间

/* Allocate a file buffer, or switch to unbuffered I/O.  Streams for
   TTY devices default to line buffered.  */
// Returns 1 after installing a freshly malloc'ed buffer on fp, or EOF when
// the allocation fails.
int _IO_file_doallocate(FILE *fp)
{
    size_t size;
    char *p;
    struct __stat64_t64 st; // File status filled in by _IO_SYSSTAT

    // Default buffer size.
    // NOTE(review): glibc's <stdio.h> is believed to define BUFSIZ as 8192,
    // not 512 -- confirm for the version under study.
    size = BUFSIZ;
    // Only when the stream has a descriptor and the stat call succeeds
    if (fp->_fileno >= 0 && __builtin_expect(_IO_SYSSTAT(fp, &st), 0) >= 0)
    {
        // Character device
        if (S_ISCHR(st.st_mode))
        {
            // It may be a terminal
            /* Possibly a tty.  */
            if (
#ifdef DEV_TTY_P
                DEV_TTY_P(&st) ||
#endif
                local_isatty(fp->_fileno))
                // Terminals are marked line buffered
                fp->_flags |= _IO_LINE_BUF;
        }
#if defined _STATBUF_ST_BLKSIZE 
        if (st.st_blksize > 0 && st.st_blksize < BUFSIZ)
            size = st.st_blksize;   // Use the file's block size when it is positive and below BUFSIZ
#endif
    }
    p = malloc(size);   // Allocate the buffer
    if (__glibc_unlikely(p == NULL))
        return EOF; // Allocation failed
    _IO_setb(fp, p, p + size, 1);   // Install [p, p + size) as the stream buffer (last argument presumably marks it as owned by the stream -- confirm)
    return 1;   
}
libc_hidden_def(_IO_file_doallocate)

_IO_in_backup:检查flag(判断_flags里是否设置了_IO_IN_BACKUP,即当前是否正在使用备份缓冲区)

/* Non-zero when the _IO_IN_BACKUP bit is set in fp's _flags. */
#define _IO_in_backup(fp) ((fp)->_flags & _IO_IN_BACKUP)

_IO_switch_to_main_get_area:交换save和read缓冲区

/* Switch current get area from backup buffer to (start of) main get area. */

void _IO_switch_to_main_get_area(FILE *fp)
{
	// Scratch variable for the two pointer swaps below
	char *tmp;
	// Clear the "in backup" flag
	fp->_flags &= ~_IO_IN_BACKUP;
	/* Swap _IO_read_end and _IO_save_end. */
	// Exchange the end pointers of the read and save areas
	tmp = fp->_IO_read_end;
	fp->_IO_read_end = fp->_IO_save_end;
	fp->_IO_save_end = tmp;
	/* Swap _IO_read_base and _IO_save_base. */
	// Exchange the base pointers of the read and save areas
	tmp = fp->_IO_read_base;
	fp->_IO_read_base = fp->_IO_save_base;
	fp->_IO_save_base = tmp;
	/* Set _IO_read_ptr. */
	// Reading restarts at the beginning of the (new) read area
	fp->_IO_read_ptr = fp->_IO_read_base;
}

setg setp:设置read write缓冲区

/* _IO_setg: set the get area of fp -- read_base = eb, read_ptr = g,
   read_end = eg.  */
#define _IO_setg(fp, eb, g, eg)  ((fp)->_IO_read_base = (eb),\
	(fp)->_IO_read_ptr = (g), (fp)->_IO_read_end = (eg))

/* _IO_setp: set the put area of __fp -- write_base and write_ptr both become
   __p, write_end becomes __ep.  */
#define _IO_setp(__fp, __p, __ep) \
       ((__fp)->_IO_write_base = (__fp)->_IO_write_ptr \
	= __p, (__fp)->_IO_write_end = (__ep))

参考资料


评论