What the fields in free's output mean
             total       used       free     shared    buffers     cached
Mem:       1028444     991676      36768          0      18080     632156
-/+ buffers/cache:      341440     687004
Swap:            0          0          0
man free says the numbers come from /proc/meminfo: the kernel publishes the data in /proc/meminfo, and the free program reads it from there.
Mem:
total = used + free. Note that this used includes both the memory that is really in use and the buffers/cached that are not really in use yet.
-/+ buffers/cache:
This line treats buffers/cache as a whole; it is what the numbers would look like if the kernel did not keep a page cache:
341440 is the physical memory the system is actually using = 991676 - (buffers + cached)
687004 is the physical memory that is really free = 36768 + (buffers + cached)
buffers + cached = 18080 + 632156 = 650236, so of 1 GB of RAM, more than 650 MB is page cache.
buffers/cached come from the kernel's page cache. No matter how much RAM is installed, after a while the kernel will gradually use it all up; the used value on the first line then approaches the size of the installed RAM, because memory that has never been touched gets turned into page cache so it is ready to be used at any moment.
The page cache consists of ordinary pages, buffer pages, and the swap cache.
buffers corresponds to the buffer pages, which carry an extra buffer_head struct for management; the swap cache exists to reduce I/O.
Ordinary files go through the filesystem and are accounted in cached;
metadata that bypasses the filesystem (the superblock, for example) is accounted in buffers.
The kernel handles both cases the same way: the request is wrapped in a bio and handed to the device driver.
# rpm -qif /usr/bin/free shows that free's source lives in the procps package.
procps-3.2.8.tar.gz -> free.c main():
....
        meminfo();
        printf("             total       used       free     shared    buffers     cached\n");
        printf(
                "%-7s %10Lu %10Lu %10Lu %10Lu %10Lu %10Lu\n", "Mem:",
                S(kb_main_total),
                S(kb_main_used),
                S(kb_main_free),
                S(kb_main_shared),
                S(kb_main_buffers),
                S(kb_main_cached)
        );
...
So buffers and cached correspond to kb_main_buffers and kb_main_cached. Stepping into meminfo(), kb_main_buffers and kb_main_cached are looked up under the names Buffers and Cached: meminfo() reads /proc/meminfo into a buffer and then parses it.
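A minimal sketch of the same idea (not the procps code, just plain sscanf against the four field names) that rebuilds free's two lines from /proc/meminfo:

/* Minimal sketch, not the procps implementation: read MemTotal, MemFree,
 * Buffers and Cached from /proc/meminfo and rebuild free's two lines. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/meminfo", "r");
        char line[128];
        unsigned long long total = 0, free_kb = 0, buffers = 0, cached = 0;

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                sscanf(line, "MemTotal: %llu kB", &total);
                sscanf(line, "MemFree: %llu kB", &free_kb);
                sscanf(line, "Buffers: %llu kB", &buffers);
                sscanf(line, "Cached: %llu kB", &cached);
        }
        fclose(f);

        printf("Mem: total=%llu used=%llu free=%llu buffers=%llu cached=%llu\n",
               total, total - free_kb, free_kb, buffers, cached);
        printf("-/+ buffers/cache: used=%llu free=%llu\n",
               total - free_kb - buffers - cached,
               free_kb + buffers + cached);
        return 0;
}

With the numbers from the free output above, this reproduces the 341440/687004 pair.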
In the kernel the corresponding code is meminfo_proc_show():
Buffers and Cached are printed from
i.bufferram 和 global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram
and si_meminfo(&i) initializes i.bufferram = nr_blockdev_pages().
So buffers is i.bufferram, the cache the system keeps for reads and writes that go directly to a block device.
cached is global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram,
which may contain both ordinary pages and buffer pages.
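To see the split in practice, here is a rough sketch (the file path and the block device path are assumptions, and reading the raw device normally requires root): reading a regular file should make Cached grow, while reading the block device directly should make Buffers grow.

/* Rough sketch: a regular file read lands in "Cached", a raw block device
 * read lands in "Buffers".  Paths are placeholders; run as root. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static unsigned long long meminfo_kb(const char *tag)       /* e.g. "Buffers:" */
{
        char line[128], fmt[64];
        unsigned long long val = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        snprintf(fmt, sizeof(fmt), "%s %%llu kB", tag);
        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, fmt, &val) == 1)
                        break;
        if (f)
                fclose(f);
        return val;
}

static void slurp(const char *path, size_t bytes)            /* read ~bytes and discard */
{
        char buf[4096];
        int fd = open(path, O_RDONLY);
        size_t done = 0;
        ssize_t n;

        if (fd < 0)
                return;
        while (done < bytes && (n = read(fd, buf, sizeof(buf))) > 0)
                done += n;
        close(fd);
}

int main(void)
{
        printf("before:            Buffers=%llu Cached=%llu\n",
               meminfo_kb("Buffers:"), meminfo_kb("Cached:"));

        slurp("/var/log/messages", 8 << 20);   /* regular file, assumed path */
        printf("after file read:   Buffers=%llu Cached=%llu\n",
               meminfo_kb("Buffers:"), meminfo_kb("Cached:"));

        slurp("/dev/sda", 8 << 20);            /* raw block device, assumed path */
        printf("after device read: Buffers=%llu Cached=%llu\n",
               meminfo_kb("Buffers:"), meminfo_kb("Cached:"));
        return 0;
}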
man proc
/proc/sys/vm/drop_caches (since Linux 2.6.16)
Writing to this file causes the kernel to drop clean caches,
dentries and inodes from memory, causing that memory to become
free.
To free pagecache, use echo 1 > /proc/sys/vm/drop_caches; to
free dentries and inodes, use echo 2 > /proc/sys/vm/drop_caches;
to free pagecache, dentries and inodes, use echo 3 >
/proc/sys/vm/drop_caches.
Because this is a non-destructive operation and dirty objects
are not freeable, the user should run sync(8) first.
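The same sequence as a small C sketch (run as root), watching Cached before and after the drop:

/* Sketch of the drop_caches sequence described above: sync() first so dirty
 * pages are written back, then write "3" to drop pagecache, dentries and
 * inodes.  Requires root. */
#include <stdio.h>
#include <unistd.h>

static unsigned long long cached_kb(void)
{
        char line[128];
        unsigned long long val = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, "Cached: %llu kB", &val) == 1)
                        break;
        if (f)
                fclose(f);
        return val;
}

int main(void)
{
        FILE *dc;

        printf("Cached before: %llu kB\n", cached_kb());
        sync();                                  /* flush dirty pages first */
        dc = fopen("/proc/sys/vm/drop_caches", "w");
        if (!dc) {
                perror("drop_caches");
                return 1;
        }
        fputs("3\n", dc);                        /* 1=pagecache, 2=dentries+inodes, 3=both */
        fclose(dc);
        printf("Cached after:  %llu kB\n", cached_kb());
        return 0;
}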
-----------------------------------------------------------------------------------------------
In top:
< and > change the sort column; the default is to sort by %CPU.
Pressing 1 shows per-core CPU information.
Goal: figure out what CODE, DATA, VIRT, RES, SHR, %MEM and load average mean.
Start from top's source: procps-3.2.8.tar.gz -> top.c -> task_show():
// Display information for a single task row.
static void task_show (const WIN_t *q, const proc_t *p)
{
   ...
   switch (i) {
      ...
      case P_COD:
         MKCOL(scale_num(PAGES_TO_KB(p->trs), w, s));
         break;
      ...
      case P_DAT:
         MKCOL(scale_num(PAGES_TO_KB(p->drs), w, s));
         break;
      case P_DRT:
         MKCOL(scale_num((unsigned)p->dt, w, s));
         break;
      ...
      case P_MEM:
         MKCOL((float)PAGES_TO_KB(p->resident) * 100 / kb_main_total);
         break;
      ...
      case P_RES:
         MKCOL(scale_num(PAGES_TO_KB(p->resident), w, s));
         break;
      case P_SHR:
         MKCOL(scale_num(PAGES_TO_KB(p->share), w, s));
         break;
      ...
      case P_VRT:
         MKCOL(scale_num(PAGES_TO_KB(p->size), w, s));
         ...
   } /* end: switch 'procflag' */
Following p->resident, the comments on proc_t show that these fields are read from /proc/#/statm:
// the next 7 members come from /proc/#/statm
    size,     // statm total # of pages of memory
    resident, // statm number of resident set (non-swapped) pages (4k)
    share,    // statm number of pages of shared (mmap'd) memory
    trs,      // statm text resident set size
    lrs,      // statm shared-lib resident set size
    drs,      // statm data resident set size
    dt;       // statm dirty pages
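A quick sketch that reads these seven fields for the current process from /proc/self/statm and converts pages to KB, roughly what top does for every task (the column labels use the mapping derived below):

/* Minimal sketch: read the seven statm fields for the current process and
 * convert pages to KB. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        unsigned long size, resident, share, trs, lrs, drs, dt;
        long page_kb = sysconf(_SC_PAGESIZE) / 1024;
        FILE *f = fopen("/proc/self/statm", "r");

        if (!f || fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
                         &size, &resident, &share, &trs, &lrs, &drs, &dt) != 7)
                return 1;
        fclose(f);

        printf("VIRT=%lu kB RES=%lu kB SHR=%lu kB CODE=%lu kB DATA=%lu kB\n",
               size * page_kb, resident * page_kb, share * page_kb,
               trs * page_kb, drs * page_kb);
        return 0;
}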
In the kernel source, look under fs/proc/:
# grep -r resident fs/proc/ | less shows that array.c and task_mmu.c are the relevant files.
# vi fs/proc/array.c and search for resident:
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                   struct pid *pid, struct task_struct *task)
{
        int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
        struct mm_struct *mm = get_task_mm(task);

        if (mm) {
                size = task_statm(mm, &shared, &text, &data, &resident);
                mmput(mm);
        }
        seq_printf(m, "%d %d %d %d %d %d %d\n",
                   size, resident, shared, text, lib, data, 0);
        return 0;
}
Exactly seven fields, which matches. Now look at how task_statm() fills in these seven fields:
int task_statm(struct mm_struct *mm, int *shared, int *text,
               int *data, int *resident)
{
        *shared = get_mm_counter(mm, file_rss);
        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT;
        *data = mm->total_vm - mm->shared_vm;
        *resident = *shared + get_mm_counter(mm, anon_rss);
        return mm->total_vm;
}
So the mapping is:
kernel source code                  |  top source code
------------------------------------+---------------------
size     -> total_vm                |  size     -> VIRT
resident -> (file_rss + anon_rss)   |  resident -> RES
shared   -> file_rss                |  share    -> SHR
text     -> (end_code - start_code) |  trs      -> CODE
data     -> (total_vm - shared_vm)  |  drs      -> DATA
%MEM = RES / total RAM
Because RES includes SHR, the %MEM values of all processes can add up to more than 100%. For example, on a machine with 100 MB of RAM, if processes A and B each occupy 60 MB and 50 MB of that is shared between them, then 60% + 60% > 100%.
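The double counting is easy to reproduce. A rough sketch: parent and child both touch the same 50 MB MAP_SHARED region, so the region shows up in the RES of both processes (and, for this shmem-backed mapping, in SHR as well):

/* Sketch: parent and child both touch one 50 MB MAP_SHARED region; the
 * region is counted in the RES of both processes, so per-process %MEM
 * values can sum to more than 100%. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

static void show_statm(const char *who)
{
        unsigned long size, resident, share;
        FILE *f = fopen("/proc/self/statm", "r");

        if (f && fscanf(f, "%lu %lu %lu", &size, &resident, &share) == 3)
                printf("%s: RES=%lu pages SHR=%lu pages\n", who, resident, share);
        if (f)
                fclose(f);
}

int main(void)
{
        size_t len = 50UL << 20;                       /* 50 MB */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        memset(p, 1, len);                             /* fault the pages in */

        if (fork() == 0) {
                memset(p, 2, len);                     /* child touches the same pages */
                show_statm("child");
                _exit(0);
        }
        wait(NULL);
        show_statm("parent");
        return 0;
}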
man top gives two formulas: VIRT = SWAP + RES and RES = CODE + DATA.
RES = SHR + anon_rss = file_rss + anon_rss
       |                            /    \
      text                       data   stack
       |                            \    /
      CODE                          DATA
2.6.33 Documentation/filesystems/proc.txt contains a table:
Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
..............................................................................
 Field    Content
 size     total program size (pages)        (same as VmSize in status)
 resident size of memory portions (pages)   (same as VmRSS in status)
 shared   number of pages that are shared   (i.e. backed by a file)
 trs      number of pages that are 'code'   (not including libs; broken,
                                             includes data segment)
 lrs      number of pages of library        (always 0 on 2.6)
 drs      number of pages of data/stack     (including libs; broken,
                                             includes library text)
 dt       number of dirty pages             (always 0 on 2.6)
..............................................................................
According to the trs row, trs supposedly includes the data segment, which would mean CODE also contains data, and the shared libraries could then not be accounted for in SHR either. This is most likely an error in the proc documentation: it would read correctly if the 'not' in the trs row were moved down to the next line, in front of 'includes data segment)'.
load average: procps-3.2.8/tload.c's loadavg() reads it from the /proc/loadavg file, so first man proc:
# cat /proc/loadavg
0.29 0.19 0.12 1/208 1961
/proc/loadavg
The first three fields in this file are load average figures
giving the number of jobs in the run queue (state R) or waiting
for disk I/O (state D) averaged over 1, 5, and 15 minutes. They
are the same as the load average numbers given by uptime(1) and
other programs. The fourth field consists of two numbers
separated by a slash (/). The first of these is the number of
currently executing kernel scheduling entities (processes,
threads); this will be less than or equal to the number of CPUs.
The value after the slash is the number of kernel scheduling
entities that currently exist on the system. The fifth field is
the PID of the process that was most recently created on the
system.
So the load average reflects how many processes the system has had to handle over a period of time, and on Linux, processes in the D state are counted into the load. If processes are waiting for disk I/O, the load goes up: for example, watching a movie on a Windows partition that this machine has mounted via samba drives the load up, and so does writing to slow media such as a USB stick. A high load therefore does not necessarily mean the CPU is busy.
On a multi-CPU machine, judge system performance with load average / number of cores.
Suppose a single dual-core CPU shows load average: 4.62, 1.04, 12.08. That means:
over the last 1 min, each core had on average 4.62/2 = 2.31 processes to handle, so about 1.31 had to wait;
over the last 5 min, each core had 1.04/2 = 0.52 processes, so nothing had to wait;
over the last 15 min, each core had 12.08/2 = 6.04 processes, so about 5.04 had to wait.
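A tiny sketch of that normalization: read /proc/loadavg and divide by the number of online cores.

/* Sketch: normalize the three load averages by the number of online CPUs. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        double a1, a5, a15;
        long ncpu = sysconf(_SC_NPROCESSORS_ONLN);
        FILE *f = fopen("/proc/loadavg", "r");

        if (!f || fscanf(f, "%lf %lf %lf", &a1, &a5, &a15) != 3)
                return 1;
        fclose(f);

        printf("per-core load: %.2f %.2f %.2f (ncpu=%ld)\n",
               a1 / ncpu, a5 / ncpu, a15 / ncpu, ncpu);
        return 0;
}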
As a rule of thumb, (load average / number of cores) < 3 means the system is fine; above 5 deserves attention. When the load is high, take a look with iostat.
The numbers above say that the backlog from 15 minutes ago had already been worked off 5 minutes ago, so the current load is healthy.
# iostat
avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           6.57    0.00    0.66    0.27    0.00   92.50
Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
sda               1.63       115.24        41.20  218568570   78149626
# grep -r load fs/proc/ | less finds the function that prints it:
static int loadavg_proc_show(struct seq_file *m, void *v)
{
        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1/200, 0);

        seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
                LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
                LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
                LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
                nr_running(), nr_threads,
                task_active_pid_ns(current)->last_pid);
        return 0;
}
To produce these three numbers the kernel relies on periodic sampling driven by HZ. HZ defaults to 1000 here, so one tick is 1 ms, but sampling does not happen on every tick, because that would disturb the scheduler too much. A much larger sampling period is used instead, once every 5 seconds, and the actual calculation is delayed by another 10 ticks:
/**
* get_avenrun - get the load average array
* @loads: pointer to dest load array
* @offset: offset to add
* @shift: shift count to shift the result left
*
* These values are estimates at best, so no need for locking.
*/
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
        loads[0] = (avenrun[0] + offset) << shift;
        loads[1] = (avenrun[1] + offset) << shift;
        loads[2] = (avenrun[2] + offset) << shift;
}
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
void calc_global_load(void)
{
        unsigned long upd = calc_load_update + 10;
        long active;

        if (time_before(jiffies, upd))
                return;

        active = atomic_long_read(&calc_load_tasks);
        active = active > 0 ? active * FIXED_1 : 0;

        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);

        calc_load_update += LOAD_FREQ;
}
#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
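Since FSHIFT, FIXED_1 and the EXP_* constants are plain numbers (FSHIFT=11, FIXED_1=2048, EXP_1=1884, EXP_5=2014, EXP_15=2037 in include/linux/sched.h), the moving average is easy to simulate in user space. A sketch that assumes a constant 4 active tasks, one iteration per 5-second LOAD_FREQ interval:

/* Sketch: simulate the kernel's fixed-point moving average with the same
 * constants, assuming a constant 4 runnable+uninterruptible tasks.  Each
 * loop iteration stands for one LOAD_FREQ interval (~5 seconds). */
#include <stdio.h>

#define FSHIFT   11                     /* bits of precision */
#define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
#define EXP_1    1884                   /* 1/exp(5sec/1min) in fixed point */
#define EXP_5    2014                   /* 1/exp(5sec/5min) */
#define EXP_15   2037                   /* 1/exp(5sec/15min) */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 4 * FIXED_1;      /* 4 active tasks, fixed point */
        int i;

        for (i = 1; i <= 180; i++) {             /* 180 * 5s = 15 minutes */
                avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                avenrun[2] = calc_load(avenrun[2], EXP_15, active);
                if (i % 12 == 0)                 /* print once per simulated minute */
                        printf("t=%3d min  %lu.%02lu %lu.%02lu %lu.%02lu\n", i / 12,
                               LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
                               LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
                               LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
        }
        return 0;
}

The 1-minute column converges toward 4.00 within a few minutes, while the 15-minute column climbs much more slowly, which is exactly the shape seen in /proc/loadavg.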