I find it solves about 80% of server issues with 5% of the effort.
top - 17:13:49 up 83 days, 23:10, 1 user, load average: 433.52, 422.54, 438.70Tasks: 2765 total, 23 running, 1621 sleeping, 0 stopped, 34 zombie%Cpu(s): 23.4 us, 9.5 sy, 0.0 ni, 65.5 id, 0.7 wa, 0.0 hi, 1.0 si, 0.0 st
load average: 433.52, 422.54, 438.70
#dstat -tp----system---- ---procs--- time |run blk new07-03 17:56:50|204 1.0 20207-03 17:56:51|212 0 23807-03 17:56:52|346 1.0 26607-03 17:56:53|279 5.0 26207-03 17:56:54|435 7.0 17707-03 17:56:55|442 3.0 25107-03 17:56:56|792 8.0 41907-03 17:56:57|504 16 15207-03 17:56:58|547 3.0 15607-03 17:56:59|606 2.0 21207-03 17:57:00|770 0 186
#free -g total used free shared buff/cache availableMem: 503 193 7 2 301 301Swap: 0 0 0
更详细的信息可以直接去读/proc/meminfo:
#cat /proc/meminfoMemTotal: 527624224 kBMemFree: 8177852 kBMemAvailable: 316023388 kBBuffers: 23920716 kBCached: 275403332 kBSwapCached: 0 kBActive: 59079772 kBInactive: 431064908 kBActive(anon): 1593580 kBInactive(anon): 191649352 kBActive(file): 57486192 kBInactive(file): 239415556 kBUnevictable: 249700 kBMlocked: 249700 kBSwapTotal: 0 kBSwapFree: 0 kB[...]
#sar -B 1
11:00:16 AM pgscank/s pgscand/s pgsteal/s %vmeff 11:00:17 AM 0.00 0.00 3591.00 0.00 11:00:18 AM 0.00 0.00 10313.00 0.00 11:00:19 AM 0.00 0.00 8452.00 0.00
Node 0 Node 1 Node 2 Node 3 --------------- --------------- --------------- ---------------MemTotal 31511.92 32255.18 32255.18 32255.18MemFree 2738.79 131.89 806.50 10352.02MemUsed 28773.12 32123.29 31448.69 21903.16Active 7580.58 419.80 9597.45 5780.64Inactive 17081.27 26844.28 19806.99 13504.79Active(anon) 6.63 0.93 2.08 5.64Inactive(anon) 12635.75 25560.53 12754.29 9053.80Active(file) 7573.95 418.87 9595.37 5775.00Inactive(file) 4445.52 1283.75 7052.70 4450.98
mkdir /sys/fs/cgroup/cpuset/overloadedecho 0-1 > /sys/fs/cgroup/cpuset/cpuset.cpusecho 0 > /sys/fs/cgroup/cpuset/cpuset.memsecho $$for i in {0..1023}; do /tmp/busy & done
#uptime 14:10:54 up 6 days, 18:52, 10 users, load average: 920.92, 411.61, 166.95
#pidstat -p `pgrep myserv` -t 115:47:05 UID TGID TID %usr %system %guest %CPU CPU Command15:47:06 0 71942 - 415.00 0.00 0.00 415.00 22 myserv15:47:06 0 - 71942 0.00 0.00 0.00 0.00 22 |__myserv...15:47:06 0 - 72079 7.00 94.00 0.00 101.00 21 |__myserv15:47:06 0 - 72080 10.00 90.00 0.00 100.00 19 |__myserv15:47:06 0 - 72081 9.00 91.00 0.00 100.00 35 |__myserv15:47:06 0 - 72082 5.00 95.00 0.00 100.00 29 |__myserv
largest=70
while :; do mem=$(ps -p `pidof mysqld` -o %mem | tail -1) imem=$(printf %.0f $mem) if [ $imem -gt $largest ]; then echo 'p malloc_stats_print(0,0,0)' | gdb --quiet -nx -p `pidof mysqld` fi sleep 10done
#perf list | grep Hardware branch-misses [Hardware event] bus-cycles [Hardware event] cache-misses [Hardware event] cache-references [Hardware event] cpu-cycles OR cycles [Hardware event] instructions [Hardware event] L1-dcache-load-misses [Hardware cache event] L1-dcache-loads [Hardware cache event] L1-dcache-store-misses [Hardware cache event] L1-dcache-stores [Hardware cache event] L1-icache-load-misses [Hardware cache event] L1-icache-loads [Hardware cache event] branch-load-misses [Hardware cache event] branch-loads [Hardware cache event] dTLB-load-misses [Hardware cache event] iTLB-load-misses [Hardware cache event] mem:<addr>[/len][:access] [Hardware breakpoint]
#perf stat -p 31925 sleep 1
Performance counter stats for process id '31925':
2184.986720 task-clock (msec) # 2.180 CPUs utilized 3,210 context-switches # 0.001 M/sec 345 cpu-migrations # 0.158 K/sec 0 page-faults # 0.000 K/sec 4,311,798,055 cycles # 1.973 GHz <not supported> stalled-cycles-frontend <not supported> stalled-cycles-backend 409,465,681 instructions # 0.09 insns per cycle <not supported> branches 8,680,257 branch-misses # 0.00% of all branches
1.002506001 seconds time elapsed
void busy(long us) { struct timeval tv1, tv2; long delta = 0; gettimeofday(&tv1, NULL); do { gettimeofday(&tv2, NULL); delta = (tv2.tv_sec - tv1.tv_sec) * 1000000 + tv2.tv_usec - tv1.tv_usec; } while (delta < us);}
void A() { busy(2000); }void B() { busy(8000); }
int main() { while (1) { A(); B(); } return 0;}
Samples: 27K of event 'cycles', Event count (approx.): 14381317911 Children Self Command Shared Object Symbol+ 99.99% 0.00% a.out [unknown] [.] 0x0000fffffb925137+ 99.99% 0.00% a.out a.out [.] _start+ 99.99% 0.00% a.out libc-2.17.so [.] __libc_start_main+ 99.99% 0.00% a.out a.out [.] main+ 99.06% 25.95% a.out a.out [.] busy+ 79.98% 0.00% a.out a.out [.] B- 71.31% 71.31% a.out [vdso] [.] __kernel_gettimeofday __kernel_gettimeofday - busy + 79.84% B + 20.16% A+ 20.01% 0.00% a.out a.out [.] A
#strace -v perf record -g -e cycles ./a.outperf_event_open({type=PERF_TYPE_HARDWARE, size=PERF_ATTR_SIZE_VER5, config=PERF_COUNT_HW_CPU_CYCLES, sample_freq=4000, sample_type=PERF_SAMPLE_IP|PERF_SAMPLE_TID|PERF_SAMPLE_TIME|PERF_SAMPLE_CALLCHAIN|PERF_SAMPLE_PERIOD, read_format=0, disabled=1, inherit=1, pinned=0, exclusive=0, exclusive_user=0, exclude_kernel=0, exclude_hv=0, exclude_idle=0, mmap=1, comm=1, freq=1, inherit_stat=0, enable_on_exec=1, task=1, watermark=0, precise_ip=0 /* arbitrary skid */, mmap_data=0, sample_id_all=1, exclude_host=0, exclude_guest=1, exclude_callchain_kernel=0, exclude_callchain_user=0, mmap2=1, comm_exec=1, use_clockid=0, context_switch=0, write_backward=0, namespaces=0, wakeup_events=0, config1=0, config2=0, sample_regs_user=0, sample_regs_intr=0, aux_watermark=0, sample_max_stack=0}, 51876, 25, -1, PERF_FLAG_FD_CLOEXEC) = 30
sudo btrace /dev/sda8,0 0 1 0.000000000 1024 A WS 302266328 + 8 <- (8,5) 794357368,0 0 2 0.000001654 1024 Q WS 302266328 + 8 [jbd2/sda5-8]8,0 0 3 0.000010042 1024 G WS 302266328 + 8 [jbd2/sda5-8]8,0 0 4 0.000011605 1024 P N [jbd2/sda5-8]8,0 0 5 0.000014993 1024 I WS 302266328 + 8 [jbd2/sda5-8]8,0 0 0 0.000018026 0 m N cfq1024SN / insert_request8,0 0 0 0.000019598 0 m N cfq1024SN / add_to_rr8,0 0 6 0.000022546 1024 U N [jbd2/sda5-8] 1
$ sudo blktrace -d /dev/sdb -w 5$ blkparse sdb -d sdb.bin$ btt -i sdb.bin==================== All Devices ====================
ALL MIN AVG MAX N--------------- ------------- ------------- ------------- -----------
Q2Q 0.000000001 0.000014397 0.008275391 347303Q2G 0.000000499 0.000071615 0.010518692 347298S2G 0.000128160 0.002107990 0.010517875 11512G2I 0.000000600 0.000001570 0.000040010 347298I2D 0.000000395 0.000000929 0.000003743 347298D2C 0.000116199 0.000144157 0.008443855 347288Q2C 0.000118211 0.000218273 0.010678657 347288
==================== Device Overhead ====================
DEV | Q2G G2I Q2M I2D D2C---------- | --------- --------- --------- --------- --------- ( 8, 16) | 32.8106% 0.7191% 0.0000% 0.4256% 66.0447%---------- | --------- --------- --------- --------- --------- Overall | 32.8106% 0.7191% 0.0000% 0.4256% 66.0447%
二 进阶篇
lat_mem_rd -P 1 -N 1 10240 512
trace-cmd record -p function --func-stack -l generic_make_request dd if=/dev/zero of=file bs=4k count=1 oflag=direct
#trace-cmd reportcpus=128 dd-11344 [104] 4148325.319997: function: generic_make_request dd-11344 [104] 4148325.320002: kernel_stack: <stack trace>=> ftrace_graph_call (ffff00000809849c)=> generic_make_request (ffff000008445b80)=> submit_bio (ffff000008445f00)=> __blockdev_direct_IO (ffff00000835a0a8)=> ext4_direct_IO_write (ffff000001615ff8)=> ext4_direct_IO (ffff0000016164c4)=> generic_file_direct_write (ffff00000825c4e0)=> __generic_file_write_iter (ffff00000825c684)=> ext4_file_write_iter (ffff0000016013b8)=> __vfs_write (ffff00000830c308)=> vfs_write (ffff00000830c564)=> ksys_write (ffff00000830c884)=> __arm64_sys_write (ffff00000830c918)=> el0_svc_common (ffff000008095f38)=> el0_svc_handler (ffff0000080960b0)=> el0_svc (ffff000008084088)
$ sudo trace-cmd record -p function_graph -g generic_make_request dd if=/dev/zero of=file bs=4k count=1 oflag=direct
这样就可以拿到整个调用过程 (report结果稍微整理过):
$ trace-cmd report dd-22961 | generic_make_request() { dd-22961 | generic_make_request_checks() { dd-22961 0.080 us | _cond_resched(); dd-22961 | create_task_io_context() { dd-22961 0.485 us | kmem_cache_alloc_node(); dd-22961 0.042 us | _raw_spin_lock(); dd-22961 0.039 us | _raw_spin_unlock(); dd-22961 1.820 us | } dd-22961 | blk_throtl_bio() { dd-22961 0.302 us | throtl_update_dispatch_stats(); dd-22961 1.748 us | } dd-22961 6.110 us | } dd-22961 | blk_queue_bio() { dd-22961 0.491 us | blk_queue_split(); dd-22961 0.299 us | blk_queue_bounce(); dd-22961 0.200 us | bio_integrity_enabled(); dd-22961 0.183 us | blk_attempt_plug_merge(); dd-22961 0.042 us | _raw_spin_lock_irq(); dd-22961 | elv_merge() { dd-22961 0.176 us | elv_rqhash_find.isra.9(); dd-22961 | deadline_merge() { dd-22961 0.108 us | elv_rb_find(); dd-22961 0.852 us | } dd-22961 2.229 us | } dd-22961 | get_request() { dd-22961 0.130 us | elv_may_queue(); dd-22961 | mempool_alloc() { dd-22961 0.040 us | _cond_resched(); dd-22961 | mempool_alloc_slab() { dd-22961 0.395 us | kmem_cache_alloc(); dd-22961 0.744 us | } dd-22961 1.650 us | } dd-22961 0.334 us | blk_rq_init(); dd-22961 0.055 us | elv_set_request(); dd-22961 4.565 us | } dd-22961 | init_request_from_bio() { dd-22961 | blk_rq_bio_prep() { dd-22961 | blk_recount_segments() { dd-22961 0.222 us | __blk_recalc_rq_segments(); dd-22961 0.653 us | } dd-22961 1.141 us | } dd-22961 1.620 us | } dd-22961 | blk_account_io_start() { dd-22961 0.137 us | disk_map_sector_rcu(); dd-22961 | part_round_stats() { dd-22961 0.195 us | part_round_stats_single(); dd-22961 0.054 us | part_round_stats_single(); dd-22961 0.955 us | } dd-22961 2.148 us | } dd-22961 + 15.847 us | } dd-22961 + 23.642 us | }
[] | main() { [] | A() { 0.160 us [ 69439] | busy(); 1.080 us [ 69439] | } /* A */ [] | B() { 0.050 us [ 69439] | busy(); 0.240 us [ 69439] | } /* B */ 1.720 us [ 69439] | } /* main */
#include <linux/kvm_host.h>
BEGIN { @qemu_pid = $1; @mmio_start = 0xa000a00000; @mmio_end = 0xa000a00000 + 16384; @devid = 1536;}
tracepoint:kvm:kvm_mmio /pid == @qemu_pid/ { if (args->gpa >= @mmio_start && args->gpa < @mmio_end) { @start = nsecs; }}
kprobe:kvm_set_msi { $e = (struct kvm_kernel_irq_routing_entry *)arg0; $kvm = (struct kvm *)arg1; if (@start > 0 && $kvm->userspace_pid == @qemu_pid && $e->msi.devid == @devid) { @dur = stats(nsecs - @start); @start = 0; }}
interval:s:1 { print(@dur); clear(@dur);}
: count 598, average 1606320, total 960579533
: count 543, average 1785906, total 969747196
: count 644, average 1495419, total 963049914
: count 624, average 1546575, total 965062935
: count 645, average 1495250, total 964436299
5 * This file contains the magic bits required to compute the global loadavg 6 * figure. Its a silly number but people think its important. We go through 7 * great pains to make it work on big machines and tickless kernels.
kprobe:calc_load_fold_active /cpu == 0/ { printf("%ld\n", nsecs / 1000000000);}
#include "kernel/sched/sched.h"kprobe:calc_global_load_tick /cpu == 0/ { $rq = (struct rq *)arg0; @[$rq->calc_load_update] = count();}
interval:s:5 { print(@); clear(@);}
#./calc_load.bt -I /kernel-source@[4465886482]: 61@[4465887733]: 1189
@[4465887733]: 62@[4465888984]: 1188
kprobe:id_nr_invalid /cpu == 0/ { printf("%ld\n", nsecs / 1000000000);}
while :; do sec=$(awk -F. '{print $1}' /proc/uptime) rem=$((sec % 5)) if [ $rem -eq 2 ]; then # 1s after updating load break; fi sleep 0.1done
for i in {0..63}; do ./busy 3 & # run 3sdone
top - 09:55:41 up 127 days, 1:44, 1 user, load average: 111.70, 87.08, 79.41Tasks: 3685 total, 21 running, 2977 sleeping, 1 stopped, 8 zombie%Cpu(s): 27.3 us, 8.9 sy, 0.0 ni, 59.8 id, 0.1 wa, 0.0 hi, 4.0 si, 0.0 st
#sar -B 1
11:00:16 AM pgscank/s pgscand/s pgsteal/s %vmeff 11:00:17 AM 0.00 0.00 3591.00 0.00 11:00:18 AM 0.00 0.00 10313.00 0.00 11:00:19 AM 0.00 0.00 8452.00 0.00
(gdb) b read_vmstat_paging(gdb) set follow-fork-mode child(gdb) rBreakpoint 1, read_vmstat_paging (st_paging=0x424f40) at rd_stats.c:751751 if ((fp = fopen(VMSTAT, "r")) == NULL)(gdb) n754 st_paging->pgsteal = 0;(gdb)757 while (fgets(line, sizeof(line), fp) != NULL) {(gdb)759 if (!strncmp(line, "pgpgin ", 7)) {(gdb)763 else if (!strncmp(line, "pgpgout ", 8)) {(gdb)767 else if (!strncmp(line, "pgfault ", 8)) {(gdb)771 else if (!strncmp(line, "pgmajfault ", 11)) {(gdb)775 else if (!strncmp(line, "pgfree ", 7)) {(gdb)779 else if (!strncmp(line, "pgsteal_", 8)) {(gdb)784 else if (!strncmp(line, "pgscan_kswapd", 13)) {(gdb)789 else if (!strncmp(line, "pgscan_direct", 13)) {(gdb)757 while (fgets(line, sizeof(line), fp) != NULL) {(gdb)
#grep pgsteal_ /proc/vmstatpgsteal_kswapd 168563pgsteal_direct 0pgsteal_anon 0pgsteal_file 978205
#grep pgscan_ /proc/vmstatpgscan_kswapd 204242pgscan_direct 0pgscan_direct_throttle 0pgscan_anon 0pgscan_file 50583828
if (current_is_kswapd()) {if (!cgroup_reclaim(sc))__count_vm_events(PGSCAN_KSWAPD, nr_scanned);count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,nr_scanned);} else {if (!cgroup_reclaim(sc))__count_vm_events(PGSCAN_DIRECT, nr_scanned);count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,nr_scanned);}__count_vm_events(PGSCAN_ANON + file, nr_scanned);
#df -h .Filesystem Size Used Avail Use% Mounted oncgroup 0 0 0 - /sys/fs/cgroup/memory#grep -c 'pgscan\|pgsteal' memory.stat0
[1]https://www.cs.rice.edu/~johnmc/comp528/lecture-notes/
[2]https://brendangregg.com/
[3]http://dtrace.org/blogs/bmc/
[4]https://blog.stgolabs.net/
[5]https://lwn.net/
[6]https://github.com/namhyung/uftrace