What the fields in free's output mean
             total       used       free     shared    buffers     cached
Mem:       1028444     991676      36768          0      18080     632156
-/+ buffers/cache:      341440     687004
Swap:            0          0          0
man free says the numbers come from /proc/meminfo: the kernel publishes the data in /proc/meminfo, and the free program reads it from there.
Mem:
total = used + free. Note that this used includes both the memory that is really in use and the buffers/cached that are not really in use yet.
-/+ buffers/cache:
This line treats buffers/cache as a whole; it is what the numbers would look like if the kernel did not keep a page cache:
341440 is the physical memory the system is actually using = 991676 - (buffers + cached)
687004 is the physical memory that is really free = 36768 + (buffers + cached)
buffers + cached = 18080 + 632156 = 650236, so of 1 GB of RAM, more than 650 MB is page cache.
buffers/cached come from the kernel's page cache. No matter how much RAM is installed, after a while the kernel will gradually use it all up; the used value on the first line then approaches the size of the installed RAM, because memory that has never been touched gets turned into page cache so it is ready to be used at any moment.
The page cache consists of ordinary pages, buffer pages, and the swap cache.
buffers corresponds to the buffer pages, which carry an extra buffer_head struct for management; the swap cache exists to reduce I/O.
Ordinary files go through the filesystem and are accounted in cached;
metadata that bypasses the filesystem (the superblock, for example) is accounted in buffers.
The kernel handles both cases the same way: the request is wrapped in a bio and handed to the device driver.
# rpm -qif /usr/bin/free shows that free's source lives in the procps package.
procps-3.2.8.tar.gz -> free.c main():
....
        meminfo();
        printf("             total       used       free     shared    buffers     cached\n");
        printf(
                "%-7s %10Lu %10Lu %10Lu %10Lu %10Lu %10Lu\n", "Mem:",
                S(kb_main_total),
                S(kb_main_used),
                S(kb_main_free),
                S(kb_main_shared),
                S(kb_main_buffers),
                S(kb_main_cached)
        );
...
So buffers and cached correspond to kb_main_buffers and kb_main_cached. Stepping into meminfo(), kb_main_buffers and kb_main_cached are looked up under the names Buffers and Cached: meminfo() reads /proc/meminfo into a buffer and then parses it.
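A minimal sketch of the same idea (not the procps code, just plain sscanf against the four field names) that rebuilds free's two lines from /proc/meminfo:

/* Minimal sketch, not the procps implementation: read MemTotal, MemFree,
 * Buffers and Cached from /proc/meminfo and rebuild free's two lines. */
#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/meminfo", "r");
        char line[128];
        unsigned long long total = 0, free_kb = 0, buffers = 0, cached = 0;

        if (!f)
                return 1;
        while (fgets(line, sizeof(line), f)) {
                sscanf(line, "MemTotal: %llu kB", &total);
                sscanf(line, "MemFree: %llu kB", &free_kb);
                sscanf(line, "Buffers: %llu kB", &buffers);
                sscanf(line, "Cached: %llu kB", &cached);
        }
        fclose(f);

        printf("Mem: total=%llu used=%llu free=%llu buffers=%llu cached=%llu\n",
               total, total - free_kb, free_kb, buffers, cached);
        printf("-/+ buffers/cache: used=%llu free=%llu\n",
               total - free_kb - buffers - cached,
               free_kb + buffers + cached);
        return 0;
}

With the numbers from the free output above, this reproduces the 341440/687004 pair.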
In the kernel the corresponding code is meminfo_proc_show():
Buffers and Cached are printed from
i.bufferram 和 global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram
and si_meminfo(&i) initializes i.bufferram = nr_blockdev_pages().
So buffers is i.bufferram, the cache the system keeps for reads and writes that go directly to a block device.
cached is global_page_state(NR_FILE_PAGES) - total_swapcache_pages - i.bufferram,
which may contain both ordinary pages and buffer pages.
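To see the split in practice, here is a rough sketch (the file path and the block device path are assumptions, and reading the raw device normally requires root): reading a regular file should make Cached grow, while reading the block device directly should make Buffers grow.

/* Rough sketch: a regular file read lands in "Cached", a raw block device
 * read lands in "Buffers".  Paths are placeholders; run as root. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

static unsigned long long meminfo_kb(const char *tag)       /* e.g. "Buffers:" */
{
        char line[128], fmt[64];
        unsigned long long val = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        snprintf(fmt, sizeof(fmt), "%s %%llu kB", tag);
        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, fmt, &val) == 1)
                        break;
        if (f)
                fclose(f);
        return val;
}

static void slurp(const char *path, size_t bytes)            /* read ~bytes and discard */
{
        char buf[4096];
        int fd = open(path, O_RDONLY);
        size_t done = 0;
        ssize_t n;

        if (fd < 0)
                return;
        while (done < bytes && (n = read(fd, buf, sizeof(buf))) > 0)
                done += n;
        close(fd);
}

int main(void)
{
        printf("before:            Buffers=%llu Cached=%llu\n",
               meminfo_kb("Buffers:"), meminfo_kb("Cached:"));

        slurp("/var/log/messages", 8 << 20);   /* regular file, assumed path */
        printf("after file read:   Buffers=%llu Cached=%llu\n",
               meminfo_kb("Buffers:"), meminfo_kb("Cached:"));

        slurp("/dev/sda", 8 << 20);            /* raw block device, assumed path */
        printf("after device read: Buffers=%llu Cached=%llu\n",
               meminfo_kb("Buffers:"), meminfo_kb("Cached:"));
        return 0;
}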
man proc
/proc/sys/vm/drop_caches (since Linux 2.6.16)
Writing to this file causes the kernel to drop clean caches,
dentries and inodes from memory, causing that memory to become
free.
To free pagecache, use echo 1 > /proc/sys/vm/drop_caches; to
free dentries and inodes, use echo 2 > /proc/sys/vm/drop_caches;
to free pagecache, dentries and inodes, use echo 3 >
/proc/sys/vm/drop_caches.
Because this is a non-destructive operation and dirty objects
are not freeable, the user should run sync(8) first.
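The same sequence as a small C sketch (run as root), watching Cached before and after the drop:

/* Sketch of the drop_caches sequence described above: sync() first so dirty
 * pages are written back, then write "3" to drop pagecache, dentries and
 * inodes.  Requires root. */
#include <stdio.h>
#include <unistd.h>

static unsigned long long cached_kb(void)
{
        char line[128];
        unsigned long long val = 0;
        FILE *f = fopen("/proc/meminfo", "r");

        while (f && fgets(line, sizeof(line), f))
                if (sscanf(line, "Cached: %llu kB", &val) == 1)
                        break;
        if (f)
                fclose(f);
        return val;
}

int main(void)
{
        FILE *dc;

        printf("Cached before: %llu kB\n", cached_kb());
        sync();                                  /* flush dirty pages first */
        dc = fopen("/proc/sys/vm/drop_caches", "w");
        if (!dc) {
                perror("drop_caches");
                return 1;
        }
        fputs("3\n", dc);                        /* 1=pagecache, 2=dentries+inodes, 3=both */
        fclose(dc);
        printf("Cached after:  %llu kB\n", cached_kb());
        return 0;
}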
-----------------------------------------------------------------------------------------------
In top:
< and > change the sort column; the default is to sort by %CPU.
Pressing 1 shows per-core CPU information.
Goal: figure out what CODE, DATA, VIRT, RES, SHR, %MEM and load average mean.
Start from top's source: procps-3.2.8.tar.gz -> top.c -> task_show():
// Display information for a single task row.
static void task_show (const WIN_t *q, const proc_t *p)
{
   ...
   switch (i) {
      ...
      case P_COD:
         MKCOL(scale_num(PAGES_TO_KB(p->trs), w, s));
         break;
      ...
      case P_DAT:
         MKCOL(scale_num(PAGES_TO_KB(p->drs), w, s));
         break;
      case P_DRT:
         MKCOL(scale_num((unsigned)p->dt, w, s));
         break;
      ...
      case P_MEM:
         MKCOL((float)PAGES_TO_KB(p->resident) * 100 / kb_main_total);
         break;
      ...
      case P_RES:
         MKCOL(scale_num(PAGES_TO_KB(p->resident), w, s));
         break;
      case P_SHR:
         MKCOL(scale_num(PAGES_TO_KB(p->share), w, s));
         break;
      ...
      case P_VRT:
         MKCOL(scale_num(PAGES_TO_KB(p->size), w, s));
         ...
   } /* end: switch 'procflag' */
Following p->resident, the comments on proc_t show that these fields are read from /proc/#/statm:
// the next 7 members come from /proc/#/statm
    size,     // statm total # of pages of memory
    resident, // statm number of resident set (non-swapped) pages (4k)
    share,    // statm number of pages of shared (mmap'd) memory
    trs,      // statm text resident set size
    lrs,      // statm shared-lib resident set size
    drs,      // statm data resident set size
    dt;       // statm dirty pages
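A quick sketch that reads these seven fields for the current process from /proc/self/statm and converts pages to KB, roughly what top does for every task (the column labels use the mapping derived below):

/* Minimal sketch: read the seven statm fields for the current process and
 * convert pages to KB. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        unsigned long size, resident, share, trs, lrs, drs, dt;
        long page_kb = sysconf(_SC_PAGESIZE) / 1024;
        FILE *f = fopen("/proc/self/statm", "r");

        if (!f || fscanf(f, "%lu %lu %lu %lu %lu %lu %lu",
                         &size, &resident, &share, &trs, &lrs, &drs, &dt) != 7)
                return 1;
        fclose(f);

        printf("VIRT=%lu kB RES=%lu kB SHR=%lu kB CODE=%lu kB DATA=%lu kB\n",
               size * page_kb, resident * page_kb, share * page_kb,
               trs * page_kb, drs * page_kb);
        return 0;
}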
In the kernel source, look under fs/proc/:
# grep -r resident fs/proc/ | less shows that array.c and task_mmu.c are the relevant files.
# vi fs/proc/array.c and search for resident:
int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
                   struct pid *pid, struct task_struct *task)
{
        int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0;
        struct mm_struct *mm = get_task_mm(task);

        if (mm) {
                size = task_statm(mm, &shared, &text, &data, &resident);
                mmput(mm);
        }
        seq_printf(m, "%d %d %d %d %d %d %d\n",
                   size, resident, shared, text, lib, data, 0);
        return 0;
}
Exactly seven fields, which matches. Now look at how task_statm() fills in these seven fields:
int task_statm(struct mm_struct *mm, int *shared, int *text,
               int *data, int *resident)
{
        *shared = get_mm_counter(mm, file_rss);
        *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT;
        *data = mm->total_vm - mm->shared_vm;
        *resident = *shared + get_mm_counter(mm, anon_rss);
        return mm->total_vm;
}
So the mapping is:
kernel source code                  |  top source code
------------------------------------+---------------------
size     -> total_vm                |  size     -> VIRT
resident -> (file_rss + anon_rss)   |  resident -> RES
shared   -> file_rss                |  share    -> SHR
text     -> (end_code - start_code) |  trs      -> CODE
data     -> (total_vm - shared_vm)  |  drs      -> DATA
%MEM = RES / total RAM
Because RES includes SHR, the %MEM values of all processes can add up to more than 100%. For example, on a machine with 100 MB of RAM, if processes A and B each occupy 60 MB and 50 MB of that is shared between them, then 60% + 60% > 100%.
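The double counting is easy to reproduce. A rough sketch: parent and child both touch the same 50 MB MAP_SHARED region, so the region shows up in the RES of both processes (and, for this shmem-backed mapping, in SHR as well):

/* Sketch: parent and child both touch one 50 MB MAP_SHARED region; the
 * region is counted in the RES of both processes, so per-process %MEM
 * values can sum to more than 100%. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

static void show_statm(const char *who)
{
        unsigned long size, resident, share;
        FILE *f = fopen("/proc/self/statm", "r");

        if (f && fscanf(f, "%lu %lu %lu", &size, &resident, &share) == 3)
                printf("%s: RES=%lu pages SHR=%lu pages\n", who, resident, share);
        if (f)
                fclose(f);
}

int main(void)
{
        size_t len = 50UL << 20;                       /* 50 MB */
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED)
                return 1;
        memset(p, 1, len);                             /* fault the pages in */

        if (fork() == 0) {
                memset(p, 2, len);                     /* child touches the same pages */
                show_statm("child");
                _exit(0);
        }
        wait(NULL);
        show_statm("parent");
        return 0;
}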
man top gives two formulas: VIRT = SWAP + RES and RES = CODE + DATA.
RES = SHR + anon_rss = file_rss + anon_rss
       |                            /    \
      text                       data   stack
       |                            \    /
      CODE                          DATA
2.6.33 Documentation/filesystems/proc.txt contains a table:
Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
..............................................................................
 Field    Content
 size     total program size (pages)        (same as VmSize in status)
 resident size of memory portions (pages)   (same as VmRSS in status)
 shared   number of pages that are shared   (i.e. backed by a file)
 trs      number of pages that are 'code'   (not including libs; broken,
                                             includes data segment)
 lrs      number of pages of library        (always 0 on 2.6)
 drs      number of pages of data/stack     (including libs; broken,
                                             includes library text)
 dt       number of dirty pages             (always 0 on 2.6)
..............................................................................
According to the trs row, trs supposedly includes the data segment, which would mean CODE also contains data, and the shared libraries could then not be accounted for in SHR either. This is most likely an error in the proc documentation: it would read correctly if the 'not' in the trs row were moved down to the next line, in front of 'includes data segment)'.
load average: procps-3.2.8/tload.c's loadavg() reads it from the /proc/loadavg file, so first man proc:
# cat /proc/loadavg
0.29 0.19 0.12 1/208 1961
/proc/loadavg
The first three fields in this file are load average figures
giving the number of jobs in the run queue (state R) or waiting
for disk I/O (state D) averaged over 1, 5, and 15 minutes. They
are the same as the load average numbers given by uptime(1) and
other programs. The fourth field consists of two numbers
separated by a slash (/). The first of these is the number of
currently executing kernel scheduling entities (processes,
threads); this will be less than or equal to the number of CPUs.
The value after the slash is the number of kernel scheduling
entities that currently exist on the system. The fifth field is
the PID of the process that was most recently created on the
system.
So the load average reflects how many processes the system has had to handle over a period of time, and on Linux, processes in the D state are counted into the load. If processes are waiting for disk I/O, the load goes up: for example, watching a movie on a Windows partition that this machine has mounted via samba drives the load up, and so does writing to slow media such as a USB stick. A high load therefore does not necessarily mean the CPU is busy.
On a multi-CPU machine, judge system performance with load average / number of cores.
Suppose a single dual-core CPU shows load average: 4.62, 1.04, 12.08. That means:
over the last 1 min, each core had on average 4.62/2 = 2.31 processes to handle, so about 1.31 had to wait;
over the last 5 min, each core had 1.04/2 = 0.52 processes, so nothing had to wait;
over the last 15 min, each core had 12.08/2 = 6.04 processes, so about 5.04 had to wait.
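A tiny sketch of that normalization: read /proc/loadavg and divide by the number of online cores.

/* Sketch: normalize the three load averages by the number of online CPUs. */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        double a1, a5, a15;
        long ncpu = sysconf(_SC_NPROCESSORS_ONLN);
        FILE *f = fopen("/proc/loadavg", "r");

        if (!f || fscanf(f, "%lf %lf %lf", &a1, &a5, &a15) != 3)
                return 1;
        fclose(f);

        printf("per-core load: %.2f %.2f %.2f (ncpu=%ld)\n",
               a1 / ncpu, a5 / ncpu, a15 / ncpu, ncpu);
        return 0;
}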
As a rule of thumb, (load average / number of cores) < 3 means the system is fine; above 5 deserves attention. When the load is high, take a look with iostat.
The numbers above say that the backlog from 15 minutes ago had already been worked off 5 minutes ago, so the current load is healthy.
# iostat
avg-cpu:  %user   %nice %system %iowait  %steal   %idle
           6.57    0.00    0.66    0.27    0.00   92.50
Device:            tps   Blk_read/s   Blk_wrtn/s   Blk_read   Blk_wrtn
sda               1.63       115.24        41.20  218568570   78149626
# grep -r load fs/proc/ | less finds the function that prints it:
static int loadavg_proc_show(struct seq_file *m, void *v)
{
        unsigned long avnrun[3];

        get_avenrun(avnrun, FIXED_1/200, 0);

        seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
                LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
                LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
                LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
                nr_running(), nr_threads,
                task_active_pid_ns(current)->last_pid);
        return 0;
}
To produce these three numbers the kernel relies on periodic sampling driven by HZ. HZ defaults to 1000 here, so one tick is 1 ms, but sampling does not happen on every tick, because that would disturb the scheduler too much. A much larger sampling period is used instead, once every 5 seconds, and the actual calculation is delayed by another 10 ticks:
/**
* get_avenrun - get the load average array
* @loads: pointer to dest load array
* @offset: offset to add
* @shift: shift count to shift the result left
*
* These values are estimates at best, so no need for locking.
*/
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
        loads[0] = (avenrun[0] + offset) << shift;
        loads[1] = (avenrun[1] + offset) << shift;
        loads[2] = (avenrun[2] + offset) << shift;
}
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
void calc_global_load(void)
{
        unsigned long upd = calc_load_update + 10;
        long active;

        if (time_before(jiffies, upd))
                return;

        active = atomic_long_read(&calc_load_tasks);
        active = active > 0 ? active * FIXED_1 : 0;

        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);

        calc_load_update += LOAD_FREQ;
}
#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
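Since FSHIFT, FIXED_1 and the EXP_* constants are plain numbers (FSHIFT=11, FIXED_1=2048, EXP_1=1884, EXP_5=2014, EXP_15=2037 in include/linux/sched.h), the moving average is easy to simulate in user space. A sketch that assumes a constant 4 active tasks, one iteration per 5-second LOAD_FREQ interval:

/* Sketch: simulate the kernel's fixed-point moving average with the same
 * constants, assuming a constant 4 runnable+uninterruptible tasks.  Each
 * loop iteration stands for one LOAD_FREQ interval (~5 seconds). */
#include <stdio.h>

#define FSHIFT   11                     /* bits of precision */
#define FIXED_1  (1 << FSHIFT)          /* 1.0 in fixed point */
#define EXP_1    1884                   /* 1/exp(5sec/1min) in fixed point */
#define EXP_5    2014                   /* 1/exp(5sec/5min) */
#define EXP_15   2037                   /* 1/exp(5sec/15min) */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun[3] = { 0, 0, 0 };
        unsigned long active = 4 * FIXED_1;      /* 4 active tasks, fixed point */
        int i;

        for (i = 1; i <= 180; i++) {             /* 180 * 5s = 15 minutes */
                avenrun[0] = calc_load(avenrun[0], EXP_1, active);
                avenrun[1] = calc_load(avenrun[1], EXP_5, active);
                avenrun[2] = calc_load(avenrun[2], EXP_15, active);
                if (i % 12 == 0)                 /* print once per simulated minute */
                        printf("t=%3d min  %lu.%02lu %lu.%02lu %lu.%02lu\n", i / 12,
                               LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
                               LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
                               LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
        }
        return 0;
}

The 1-minute column converges toward 4.00 within a few minutes, while the 15-minute column climbs much more slowly, which is exactly the shape seen in /proc/loadavg.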