# cat mysql-configmap.yaml    // MySQL config files kept in a ConfigMap
apiVersion: v1
kind: ConfigMap
metadata:
  name: mysql
  labels:
    app: mysql
data:
  master.cnf: |
    # Apply this config only on the master.
    [mysqld]
    log-bin
  mysqld.cnf: |
    [mysqld]
    pid-file = /var/run/mysqld/mysqld.pid
    socket   = /var/run/mysqld/mysqld.sock
    datadir  = /var/lib/mysql
    #log-error = /var/log/mysql/error.log
    # By default we only accept connections from localhost
    #bind-address = 127.0.0.1
    # Disabling symbolic-links is recommended to prevent assorted security risks
    symbolic-links=0
    sql_mode='STRICT_TRANS_TABLES,NO_ZERO_IN_DATE,NO_ZERO_DATE,ERROR_FOR_DIVISION_BY_ZERO,NO_AUTO_CREATE_USER,NO_ENGINE_SUBSTITUTION'
    # Slow-query threshold: queries that run longer than this are written to the slow log
    long_query_time = 2
    innodb_buffer_pool_size = 257M
  slave.cnf: |
    # Apply this config only on slaves.
    [mysqld]
    super-read-only
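To load and check the ConfigMap (plain kubectl; the file name comes from the cat above, everything else is standard):

$ kubectl apply -f mysql-configmap.yaml
$ kubectl get configmap mysql -o yaml    # verify that master.cnf, mysqld.cnf and slave.cnf were stored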
#cat $JWT_TOKEN_DEFAULT_DEFAULT
eyJhbGciOiJSUzI1NiIsImtpZCI6ImlNVVFVNmxUM2t4c3Y2Q3IyT1BzV2hDZGRVSmVxTHc5RV8wUXZ4RVM5REEifQ.eyJpc3MiOiJrdWJlcm5ldGVzL3NlcnZpY2VhY2NvdW50Iiwia3ViZXJuZXRlcy5pby9zZXJ2aWNlYWNjb3VudC9uYW1lc3BhY2UiOiJkZWZhdWx0Iiwia3ViZXJ: File name too long
// Open the coredump
$gdb /opt/taobao/java/bin/java core.24086
[New LWP 27184]
[New LWP 27186]
[New LWP 24086]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib64/libthread_db.so.1".
Core was generated by `/opt/tt/java_coroutine/bin/java'.
#0  0x00007f2fa4fada35 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
Missing separate debuginfos, use: debuginfo-install jdk-8.9.14-20200203164153.alios7.x86_64

(gdb) info threads    // list all threads
  Id   Target Id                            Frame
  583  Thread 0x7f2fa56177c0 (LWP 24086)    0x00007f2fa4fab017 in pthread_join () from /lib64/libpthread.so.0
  582  Thread 0x7f2f695f3700 (LWP 27186)    0x00007f2fa4fada35 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
  581  Thread 0x7f2f6cbfb700 (LWP 27184)    0x00007f2fa4fada35 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
  580  Thread 0x7f2f691ef700 (LWP 27176)    0x00007f2fa4fada35 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
  579  Thread 0x7f2f698f6700 (LWP 27174)    0x00007f2fa4fada35 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0

(gdb) thread apply all bt    // dump the stack of every thread
Thread 583 (Thread 0x7f2fa56177c0 (LWP 24086)):
#0  0x00007f2fa4fab017 in pthread_join () from /lib64/libpthread.so.0
#1  0x00007f2fa4b85085 in ContinueInNewThread0 (continuation=continuation@entry=0x7f2fa4b7fd70 <JavaMain>, stack_size=1048576, args=args@entry=0x7ffe529432d0) at /ssd1/jenkins_home/workspace/ajdk.8.build.master/jdk/src/solaris/bin/java_md_solinux.c:1044
#2  0x00007f2fa4b81877 in ContinueInNewThread (ifn=ifn@entry=0x7ffe529433d0, threadStackSize=<optimized out>, argc=<optimized out>, argv=0x7f2fa3c163a8, mode=mode@entry=1, what=what@entry=0x7ffe5294be17 "com.taobao.tddl.server.TddlLauncher", ret=0) at /ssd1/jenkins_home/workspace/ajdk.8.build.master/jdk/src/share/bin/java.c:2033
#3  0x00007f2fa4b8513b in JVMInit (ifn=ifn@entry=0x7ffe529433d0, threadStackSize=<optimized out>, argc=<optimized out>, argv=<optimized out>, mode=mode@entry=1, what=what@entry=0x7ffe5294be17 "com.taobao.tddl.server.TddlLauncher", ret=ret@entry=0) at /ssd1/jenkins_home/workspace/ajdk.8.build.master/jdk/src/solaris/bin/java_md_solinux.c:1091
#4  0x00007f2fa4b8254d in JLI_Launch (argc=0, argv=0x7f2fa3c163a8, jargc=<optimized out>, jargv=<optimized out>, appclassc=1, appclassv=0x0, fullversion=0x400885 "1.8.0_232-b604", dotversion=0x400881 "1.8", pname=0x40087c "java", lname=0x40087c "java", javaargs=0 '\000', cpwildcard=1 '\001', javaw=0 '\000', ergo=0) at /ssd1/jenkins_home/workspace/ajdk.8.build.master/jdk/src/share/bin/java.c:304
#5  0x0000000000400635 in main ()

Thread 582 (Thread 0x7f2f695f3700 (LWP 27186)):
#0  0x00007f2fa4fada35 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1  0x00007f2fa342d863 in Parker::park(bool, long) () from /opt/taobao/install/ajdk-8_9_14-b604/jre/lib/amd64/server/libjvm.so
#2  0x00007f2fa35ba3c3 in Unsafe_Park () from /opt/taobao/install/ajdk-8_9_14-b604/jre/lib/amd64/server/libjvm.so
#3  0x00007f2f9343b44a in ?? ()
#4  0x000000008082e778 in ?? ()
#5  0x0000000000000003 in ?? ()
#6  0x00007f2f88e32758 in ?? ()
#7  0x00007f2f6f532800 in ?? ()
Attaching to core core.24086 from executable /opt/taobao/java/bin/java, please wait...
Debugger attached successfully.
Server compiler detected.
JVM version is 25.232-b604
Deadlock Detection:
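The "Attaching to core ..." output above comes from the HotSpot Serviceability Agent rather than gdb. A hedged sketch of how to attach it to the same core (assuming a JDK 8 layout with sa-jdi.jar; on JDK 9 and later the equivalent command is jhsdb jstack):

$ java -cp $JAVA_HOME/lib/sa-jdi.jar sun.jvm.hotspot.tools.JStack /opt/taobao/java/bin/java core.24086
$ jhsdb jstack --exe /opt/taobao/java/bin/java --core core.24086    # JDK 9+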
$cat /etc/pam.d/system-auth
#%PAM-1.0
# This file is auto-generated.
# User changes will be destroyed the next time authconfig is run.
auth        required      pam_env.so
auth        required      pam_faildelay.so delay=2000000
auth        sufficient    pam_unix.so nullok try_first_pass
auth        requisite     pam_succeed_if.so uid >= 1000 quiet_success
auth        required      pam_deny.so
If dmesg does not show any information about NUMA, increase the ring buffer size: boot with 'log_buf_len=16M' (or some other large value). Refer to the kbase article "How do I increase the kernel log ring buffer size?" for the steps.
Please install a package which provides this module, or verify that the module is installed correctly.
It's possible that the above module doesn't match the current version of Python, which is: 2.6.6 (r266:84292, Sep 4 2013, 07:46:00) [GCC 4.4.7 20120313 (Red Hat 4.4.7-3)]
If you cannot solve this problem yourself, please go to the yum faq at: http://yum.baseurl.org/wiki/Faq
Check and fix the related library paths or remove 3rd party libraries, usually libcurl or libssh2. On an x86_64 system, the standard paths for those libraries are /usr/lib64/libcurl.so.4 and /usr/lib64/libssh2.so.1.
PROCESS STATE CODES
Here are the different values that the s, stat and state output specifiers (header "STAT" or "S") will display to describe the state of a process:
D    uninterruptible sleep (usually IO)    # Ignores all signals, so kill has no effect; typically seen during disk or network I/O
R    running or runnable (on run queue)    # Running, or runnable: all required resources are ready and the process is waiting to be scheduled by the kernel
S    interruptible sleep (waiting for an event to complete)    # Sleeping until some event arrives
T    stopped by job control signal    # Stopped: pressing Ctrl+Z actually sends SIGTSTP to the process (kill -l lists all signals)
t    stopped by debugger during the tracing    # The state a process enters after being attached by ltrace/strace/a debugger
W    paging (not valid since the 2.6.xx kernel)    # No longer used
X    dead (should never be seen)    # The state of a process as it exits
Z    defunct ("zombie") process, terminated but not reaped by its parent    # The parent never reaped the exited child, commonly called a zombie process
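For example (standard ps options, nothing specific to this document), the STAT column can be filtered to list processes currently stuck in uninterruptible sleep, together with the kernel function they are waiting in:

$ ps -eo pid,stat,wchan:30,comm | awk 'NR==1 || $2 ~ /^D/'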
Fedora: based on Red Hat Linux. After Red Hat Linux was discontinued, Red Hat planned to have Fedora take over its role for personal use, while the separately released Red Hat Enterprise Linux took over its role in commercial settings. For users, Fedora is a complete, rapidly updated free operating system; for its sponsor Red Hat, it is a proving ground for new technologies, and the ones judged ready eventually land in Red Hat Enterprise Linux. Fedora releases a new version roughly every six months.
The ALinux2 OS (kernel 4.19) you get by default on Alibaba Cloud, with exactly the same configuration, only reached 16,000 tps, noticeably worse than the 22,000 on 2.6.32. All I could do was curse under my breath; the usual metrics showed nothing obviously wrong, it simply looked as if the CPU were weaker. If you went straight to the kernel team at this point, they would probably think: what on earth is X? Isn't it your test, or your configuration, that is broken? Don't drag me into this; kernel performance is benchmarked in the lab before every release, so it must be your application's problem.
$~/tools/async-profiler/profiler.sh -e lock -d 5 1560
--- 1687260767618 ns (100.00%), 91083 samples
[ 0] ch.qos.logback.classic.sift.SiftingAppender
[ 1] ch.qos.logback.core.AppenderBase.doAppend
[ 2] ch.qos.logback.core.spi.AppenderAttachableImpl.appendLoopOnAppenders
[ 3] ch.qos.logback.classic.Logger.appendLoopOnAppenders
[ 4] ch.qos.logback.classic.Logger.callAppenders
[ 5] ch.qos.logback.classic.Logger.buildLoggingEventAndAppend
[ 6] ch.qos.logback.classic.Logger.filterAndLog_0_Or3Plus
[ 7] ch.qos.logback.classic.Logger.info
[ 8] com.*****.logger.slf4j.Slf4jLogger.info
[ 9] com.*****.utils.logger.support.FailsafeLogger.info
[10] com.*****.util.LogUtils.recordSql
"ServerExecutor-3-thread-480" #753 daemon prio=5 os_prio=0 tid=0x00007f8265842000 nid=0x26f1 waiting for monitor entry [0x00007f82270bf000]
java.lang.Thread.State: BLOCKED (on object monitor)
at ch.qos.logback.core.AppenderBase.doAppend(AppenderBase.java:64)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
at ch.qos.logback.core.spi.AppenderAttachableImpl.appendLoopOnAppenders(AppenderAttachableImpl.java:48)
at ch.qos.logback.classic.Logger.appendLoopOnAppenders(Logger.java:282)
at ch.qos.logback.classic.Logger.callAppenders(Logger.java:269)
at ch.qos.logback.classic.Logger.buildLoggingEventAndAppend(Logger.java:470)
at ch.qos.logback.classic.Logger.filterAndLog_0_Or3Plus(Logger.java:424)
at ch.qos.logback.classic.Logger.info(Logger.java:628)
at com.****.utils.logger.slf4j.Slf4jLogger.info(Slf4jLogger.java:42)
at com.****.utils.logger.support.FailsafeLogger.info(FailsafeLogger.java:102)
at com.****.util.LogUtils.recordSql(LogUtils.java:115)
ns percent samples top
---------- ------- ------- ---
160442633302 99.99% 38366 ch.qos.logback.classic.sift.SiftingAppender
12480081 0.01% 19 java.util.Properties
3059572 0.00% 9 com.***.$$$.common.IdGenerator
244394 0.00% 1 java.lang.Object
The thread dump also shows a large number of frames like:
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- locked <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
- waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
The on-site DBA saw from monitoring that MySQL CPU usage was below 20% and there were no slow queries. They also tried hitting one of the MySQL instances directly with a client, bypassing all the middle layers, and could drive that MySQL's CPU to 100% at roughly 38,000 QPS (whereas in the scenario above, when the client QPS was 700, a single MySQL instance only reached about 6,000 QPS). MySQL was therefore ruled out as the suspect (this reasoning was not rigorous enough and dug a big hole for the rest of the investigation).
When multiple threads compete for a lock, a thread that fails to acquire it "busy-waits" until it gets the lock. What does "busy-waiting" mean? It does not mean executing the CAS operation in a tight loop forever; it cooperates with the CPU, using the CPU's PAUSE instruction to reduce cache ping-pong and power consumption while spinning. On a single-core CPU busy-waiting is pointless, so the thread puts itself to sleep instead.
The x86 PAUSE instruction
x86 provides the PAUSE instruction: code that executes PAUSE holds on to the CPU rather than yielding it, but the CPU takes a short nap, on the order of 10 clock cycles, whereas a context switch costs several thousand clock cycles.
So when an application fails to grab a spin lock it can PAUSE before retrying. For MySQL this pause alone is not long enough, which is why the parameter innodb_spin_wait_delay exists to stretch the rest period.
Power saving: the CPU gets to rest without being given up. While the CPU is executing PAUSE, top still shows 100% CPU, yet it is not actually burning power.
So the PAUSE instruction improves hyper-threading utilization, saves power, and reduces context switches, all of which make spin locks more efficient.
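As a minimal sketch of the idea (not MySQL's actual code; names are made up for illustration, and it assumes x86 with a compiler that provides _mm_pause()), a test-and-set spin lock that issues PAUSE on every failed attempt looks like this in C:

#include <stdatomic.h>
#include <immintrin.h>   /* _mm_pause() emits the x86 PAUSE instruction */

typedef struct { atomic_flag flag; } spinlock_t;   /* initialize with ATOMIC_FLAG_INIT */

static void spin_lock(spinlock_t *l)
{
    /* On failure, PAUSE instead of hammering the cache line with further attempts. */
    while (atomic_flag_test_and_set_explicit(&l->flag, memory_order_acquire))
        _mm_pause();
}

static void spin_unlock(spinlock_t *l)
{
    atomic_flag_clear_explicit(&l->flag, memory_order_release);
}

InnoDB's real spin loop works on the same principle, but, roughly speaking, it pauses for a random multiple of innodb_spin_wait_delay rounds between attempts rather than a single PAUSE.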
The PAUSE instruction is first introduced for Intel Pentium 4 processor to improve the performance of “spin-wait loop”. The PAUSE instruction is typically used with software threads executing on two logical processors located in the same processor core, waiting for a lock to be released. Such short wait loops tend to last between tens and a few hundreds of cycles. When the wait loop is expected to last for thousands of cycles or more, it is preferable to yield to the operating system by calling one of the OS synchronization API functions, such as WaitForSingleObject on Windows OS.
An Intel® processor suffers a severe performance penalty when exiting the loop because it detects a possible memory order violation. The PAUSE instruction provides a hint to the processor that the code sequence is a spin-wait loop. The processor uses this hint to avoid the memory order violation in most situations. The PAUSE instruction can improve the performance of the processors supporting Intel Hyper-Threading Technology when executing “spin-wait loops”. With Pause instruction, processors are able to avoid the memory order violation and pipeline flush, and reduce power consumption through pipeline stall.
The latency of the PAUSE instruction in prior generation microarchitectures is about 10 cycles, whereas in Skylake microarchitecture it has been extended to as many as 140 cycles.
The PAUSE instruction can improve the performance of processors supporting Intel Hyper-Threading Technology when executing “spin-wait loops” and other routines where one thread is accessing a shared lock or semaphore in a tight polling loop. When executing a spin-wait loop, the processor can suffer a severe performance penalty when exiting the loop because it detects a possible memory order violation and flushes the core processor’s pipeline. The PAUSE instruction provides a hint to the processor that the code sequence is a spin-wait loop. The processor uses this hint to avoid the memory order violation and prevent the pipeline flush. In addition, the PAUSE instruction de-pipelines the spin-wait loop to prevent it from consuming execution resources excessively and consuming power needlessly. (See Section 8.10.6.1, “Use the PAUSE Instruction in Spin-Wait Loops,” for more information about using the PAUSE instruction with IA-32 processors supporting Intel Hyper-Threading Technology.)
On arm64 we have seen on several databases that ISB (instruction synchronization barrier) is better to use than yield in a spin loop. The yield instruction is a nop. The isb instruction puts the processor to sleep for some short time. isb is a good equivalent to the pause instruction on x86.
Below is an experiment that shows the effects of yield and isb on Arm64 and the time of a pause instruction on x86 Intel processors. The micro-benchmarks use https://github.com/google/benchmark.git
Test code:
$ cat a.cc
#include <benchmark/benchmark.h>

static void BM_scalar_increment(benchmark::State& state) {
  int i = 0;
  for (auto _ : state)
    benchmark::DoNotOptimize(i++);
}
BENCHMARK(BM_scalar_increment);

static void BM_yield(benchmark::State& state) {
  for (auto _ : state)
    asm volatile("yield"::);
}
BENCHMARK(BM_yield);

static void BM_isb(benchmark::State& state) {
  for (auto _ : state)
    asm volatile("isb"::);
}
BENCHMARK(BM_isb);

BENCHMARK_MAIN();
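Assuming Google Benchmark is installed system-wide (the flags below are the usual ones, adjust to your setup), the file builds and runs like this:

$ g++ -O2 -std=c++11 a.cc -lbenchmark -lpthread -o a.out
$ ./a.out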
So if this reset had really been sent by MySQL, its IP identification should have been 23404, because the previous packet MySQL sent had identification 23403. In fact it was 13502 (and, worse, the same identification as the Use Database packet Navicat had sent), which is clearly wrong.
So we can reasonably conjecture that some middle device, on seeing the Use Database packet, triggered a "do not allow" rule and forged a reset to the MySQL server on Navicat's behalf: it reused Navicat's src ip, src port and seq, and even Navicat's identification, so the reset the MySQL server received looked perfectly normal (everything matched, with no trace of the forgery left behind).
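A hedged way to check this kind of forgery in a capture (standard tcpdump options; capture.pcap is a placeholder name): print the RST packets verbosely and compare their IP id fields with the ids of the packets the real endpoint sent around the same time.

$ tcpdump -nn -vv -r capture.pcap 'tcp[tcpflags] & tcp-rst != 0'    # -vv prints the IP header, including "id"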
[ren@vb 18:14 /home/ren]
$sudo mkdir /media/ren/hd
[ren@vb 18:15 /home/ren]
$sudo mount /dev/sd
sda sda1 sda2 sda5 sdb sdb1 sdb2 sdb5 sdc sdc1 sdc2 sdc5
[ren@vb 18:15 /home/ren]
$sudo mount /dev/sdc1 /media/ren/hd
mount: /dev/sdc1 is write-protected, mounting read-only
mount: wrong fs type, bad option, bad superblock on /dev/sdc1,
missing codepage or helper program, or other error
In some cases useful info is found in syslog - try
dmesg | tail or so.
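A reasonable next step when mount fails like this (standard utilities, nothing specific to this disk) is to check what is actually on the partition and what the kernel logged:

$ sudo blkid /dev/sdc1    # shows the filesystem type (if any) on the partition
$ dmesg | tail            # the kernel usually says why the mount failed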
The new socket option allows multiple sockets on the same host to bind to the same port, and is intended to improve the performance of multithreaded network server applications running on top of multicore systems.
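A minimal sketch of using the option in C, assuming this refers to SO_REUSEPORT (Linux 3.9+); each worker creates its own listening socket, and every one of them must set the option before bind():

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Create one listening socket; call this once per worker so they share the port. */
int listener_with_reuseport(unsigned short port)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int one = 1;

    /* Must be set on every socket sharing the port, before bind(). */
    setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));

    struct sockaddr_in addr = { 0 };
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(port);

    bind(fd, (struct sockaddr *)&addr, sizeof(addr));
    listen(fd, 128);
    return fd;   /* error handling omitted for brevity */
}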
Product of data link’s capacity and its end-to-end delay. The result is the maximum amount of unacknowledged data that can be in flight at any point in time.
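For example, a 1 Gbit/s path with a 40 ms round-trip time gives BDP = 1 Gbit/s × 0.04 s = 40 Mbit ≈ 5 MB, so roughly 5 MB of unacknowledged data must be in flight (and the receive window must allow it) to keep that link full.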
ss -itmpn dst "10.81.212.8"
State Recv-Q Send-Q Local Address:Port Peer Address:Port
ESTAB 0 0 10.xx.xx.xxx:22 10.yy.yy.yyy:12345 users:(("sshd",pid=1442,fd=3))
skmem:(r0,rb369280,t0,tb87040,f4096,w0,o0,bl0,d92)
Here we can see this socket has a Receive Buffer of 369280 bytes and a Transmit Buffer of 87040 bytes. Keep in mind the kernel will double any socket buffer allocation for overhead.
So if a process asks for a 256 KiB buffer with setsockopt(SO_RCVBUF), it will actually get 512 KiB of buffer space. This is described in man 7 socket.
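A small sketch in C that makes the doubling visible (illustrative only; the reported value is also capped by net.core.rmem_max):

#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    int asked = 256 * 1024, got = 0;
    socklen_t len = sizeof(got);

    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &asked, sizeof(asked));
    getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);

    /* Prints roughly 524288 (2 * 262144) if net.core.rmem_max allows it. */
    printf("asked for %d, kernel reports %d\n", asked, got);
    return 0;
}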
The kernel logic that computes the initial receive window; the key step is the rcvq_space.space calculation marked by the comment below:
/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND 10
/* 3. Try to fixup all. It is made immediately after connection enters
established state.
*/
void tcp_init_buffer_space(struct sock *sk)
{
int tcp_app_win = sock_net(sk)->ipv4.sysctl_tcp_app_win;
struct tcp_sock *tp = tcp_sk(sk);
int maxwin;
if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK))
tcp_sndbuf_expand(sk);
// how the initial maximum receive window is computed
tp->rcvq_space.space = min_t(u32, tp->rcv_wnd, TCP_INIT_CWND * tp->advmss);
tcp_mstamp_refresh(tp);
tp->rcvq_space.time = tp->tcp_mstamp;
tp->rcvq_space.seq = tp->copied_seq;
maxwin = tcp_full_space(sk);
if (tp->window_clamp >= maxwin) {
tp->window_clamp = maxwin;
if (tcp_app_win && maxwin > 4 * tp->advmss)
tp->window_clamp = max(maxwin -
(maxwin >> tcp_app_win),
4 * tp->advmss);
}
/* Force reservation of one segment. */
if (tcp_app_win &&
tp->window_clamp > 2 * tp->advmss &&
tp->window_clamp + tp->advmss > maxwin)
tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss);
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp);
tp->snd_cwnd_stamp = tcp_jiffies32;
}
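For example, with a typical advmss of 1460 bytes, TCP_INIT_CWND * advmss = 10 × 1460 = 14600 bytes, so rcvq_space.space starts at min(rcv_wnd, 14600); receive-buffer autotuning then grows it from there as the application reads data.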
protected int socketRecvBuffer = 32 * 1024;   // receive buffer: 32 KB
protected int socketSendBuffer = 64 * 1024;   // send buffer: 64 KB; the kernel will actually allocate 128 KB

// If set to 0, the system defaults from '/etc/sysctl.conf' are used
// refer: net.ipv4.tcp_wmem / net.ipv4.tcp_rmem
if (socketRecvBuffer > 0) {
    socket.setReceiveBufferSize(socketRecvBuffer);
}
if (socketSendBuffer > 0) {
    socket.setSendBufferSize(socketSendBuffer);
}
#!/usr/bin/stap
# Simple probe to detect when a process is waiting for more socket send
# buffer memory. Usually means the process is doing writes larger than the
# socket send buffer size or there is a slow receiver at the other side.
# Increasing the socket's send buffer size might help decrease application
# latencies, but it might also make it worse, so buyer beware.
probe kernel.function("sk_stream_wait_memory")
{
printf("%u: %s(%d) blocked on full send buffer\n",
gettimeofday_us(), execname(), pid())
}
probe kernel.function("sk_stream_wait_memory").return
{
printf("%u: %s(%d) recovered from full send buffer\n",
gettimeofday_us(), execname(), pid())
}
# Typical output: timestamp in microseconds: procname(pid) event
#
# 1218230114875167: python(17631) blocked on full send buffer
# 1218230114876196: python(17631) recovered from full send buffer
# 1218230114876271: python(17631) blocked on full send buffer
# 1218230114876479: python(17631) recovered from full send buffer
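To run the probe (assuming the script above is saved as sk_stream_wait_memory.stp and kernel debuginfo is installed; the file name is just an example):

$ sudo stap sk_stream_wait_memory.stp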
2024 Netflix: Investigation of a Cross-regional Network Performance Issue. A kernel upgrade removed the sysctl_tcp_adv_win_scale parameter and replaced it with a new calculation, so requests that used to finish transferring within 30 seconds could no longer finish under the new mechanism, causing request timeouts on the application side. "This commit obsoleted sysctl_tcp_adv_win_scale and introduced a scaling_ratio that can more accurately calculate the overhead or window size, which is the right thing to do. With the change, the window size is now rcvbuf * scaling_ratio." In short, after the kernel upgrade the effective receive buffer was cut in half; throughput was therefore halved as well, and data transfer time doubled.
receive window is not fully opened immediately. Linux keeps the receive window small, as it tries to predict the metadata cost and avoid overshooting the memory budget, therefore hitting TCP collapse. By default, with the net.ipv4.tcp_adv_win_scale=1, the upper limit for the advertised window is 50% of “free” memory. rcv_ssthresh starts up with 64KiB and grows linearly up to that limit.
Using a large chunk of receive buffer space for the metadata is not really what the programmer wants. To counter that, when the socket is under memory pressure complex logic is run with the intention of freeing some space. One of the operations is tcp_collapse and it will merge adjacent TCP packets into one larger sk_buff. This behavior is pretty much a garbage collection (GC)—and as everyone knows, when the garbage collection kicks in, the latency must spike.
#################################################################
# tcp_retransmit.stp
# Author: Yang Bingwu (detailyang) <detailyang@gmail.com>
# This systemtap script prints TCP retransmission packets
#################################################################
global record%
global cwnd_record%
probe begin { log("Printing tcp retransmission") }
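The excerpt stops after the header; as a minimal sketch (my own simplified body, not the original script, and assuming the running kernel still exports tcp_retransmit_skb with a struct sock *sk argument) the main probe could look like:

probe kernel.function("tcp_retransmit_skb")
{
    record[pid(), execname()] <<< 1    # count retransmits per process
    printf("%d %s(%d) retransmit on sock %p\n", gettimeofday_s(), execname(), pid(), $sk)
}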
cat tcpsynbl_example.txt
Demonstrations of tcpsynbl, the Linux bpftrace/eBPF version.
This tool shows the TCP SYN backlog size during SYN arrival as a histogram.
This lets you see how close your applications are to hitting the backlog limit
and dropping SYNs (causing performance issues with SYN retransmits). For
example:
# ./tcpsynbl.bt
Attaching 4 probes...
Tracing SYN backlog size. Ctrl-C to end.
^C
@backlog[backlog limit]: histogram of backlog size
@backlog[500]:
[0] 2266 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[1] 3 | |
[2, 4) 1 | |
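As a side note (standard ss behavior, not specific to this trace): for sockets in the LISTEN state, ss reports the current accept-queue length in Recv-Q and the configured backlog limit in Send-Q, which is a quick way to see how close a listener is to the limit shown in the histogram above.

$ ss -ltn    # Recv-Q = current accept queue, Send-Q = backlog limit (LISTEN sockets only)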
cat tcpaccept_example.txt
Demonstrations of tcpaccept, the Linux bpftrace/eBPF version.
This tool traces the kernel function accepting TCP socket connections (eg, a passive connection via accept(); not connect()). Some example output (IP addresses changed to protect the innocent):

# ./tcpaccept
Tracing tcp accepts. Hit Ctrl-C to end.
TIME     PID      COMM   RADDR          RPORT LADDR         LPORT BL
00:34:19 3949061  nginx  10.228.22.228  44226 10.229.20.169 8080  0/128
00:34:19 3951399  ruby   127.0.0.1      52422 127.0.0.1     8000  0/128
00:34:19 3949062  nginx  10.228.23.128  35408 10.229.20.169 8080  0/128
This output shows three connections: an IPv4 connection to PID 1463622, a "redis-server" process listening on port 6379, and one IPv6 connection to a "thread.rb" process listening on port 8000. The remote address and port are also printed, and the accept queue's current size as well as its maximum size are shown.

The overhead of this tool should be negligible, since it is only tracing the kernel function performing accept. It is not tracing every packet and then filtering.

This tool only traces successful TCP accept()s. Connection attempts to closed ports will not be shown (those can be traced via other functions).

There is another version of this tool in bcc: https://github.com/iovisor/bcc
--- 1687260767618 ns (100.00%), 91083 samples
  [ 0] ch.qos.logback.classic.sift.SiftingAppender
  [ 1] ch.qos.logback.core.AppenderBase.doAppend
  [ 2] ch.qos.logback.core.spi.AppenderAttachableImpl.appendLoopOnAppenders
  [ 3] ch.qos.logback.classic.Logger.appendLoopOnAppenders
  [ 4] ch.qos.logback.classic.Logger.callAppenders
  [ 5] ch.qos.logback.classic.Logger.buildLoggingEventAndAppend
  [ 6] ch.qos.logback.classic.Logger.filterAndLog_0_Or3Plus
  [ 7] ch.qos.logback.classic.Logger.info
  [ 8] com.taobao.tddl.common.utils.logger.slf4j.Slf4jLogger.info
  [ 9] com.taobao.tddl.common.utils.logger.support.FailsafeLogger.info
  [10] com.alibaba.cobar.server.util.LogUtils.recordSql
  [11] com.alibaba.cobar.server.ServerConnection.innerExecute
  [12] com.alibaba.cobar.server.ServerConnection.innerExecute
  [13] com.alibaba.cobar.server.ServerConnection$1.run
  [14] com.taobao.tddl.common.utils.thread.FlowControlThreadPool$RunnableAdapter.run
  [15] java.util.concurrent.Executors$RunnableAdapter.call
  [16] java.util.concurrent.FutureTask.run
  [17] java.util.concurrent.ThreadPoolExecutor.runWorker
  [18] java.util.concurrent.ThreadPoolExecutor$Worker.run
  [19] java.lang.Thread.run

"ServerExecutor-3-thread-480" #753 daemon prio=5 os_prio=0 tid=0x00007f8265842000 nid=0x26f1 waiting for monitor entry [0x00007f82270bf000]
   java.lang.Thread.State: BLOCKED (on object monitor)
    at ch.qos.logback.core.AppenderBase.doAppend(AppenderBase.java:64)
    - waiting to lock <0x00007f866dcec208> (a ch.qos.logback.classic.sift.SiftingAppender)
    at ch.qos.logback.core.spi.AppenderAttachableImpl.appendLoopOnAppenders(AppenderAttachableImpl.java:48)
    at ch.qos.logback.classic.Logger.appendLoopOnAppenders(Logger.java:282)
    at ch.qos.logback.classic.Logger.callAppenders(Logger.java:269)
    at ch.qos.logback.classic.Logger.buildLoggingEventAndAppend(Logger.java:470)
    at ch.qos.logback.classic.Logger.filterAndLog_0_Or3Plus(Logger.java:424)
    at ch.qos.logback.classic.Logger.info(Logger.java:628)
    at com.taobao.tddl.common.utils.logger.slf4j.Slf4jLogger.info(Slf4jLogger.java:42)
    at com.taobao.tddl.common.utils.logger.support.FailsafeLogger.info(FailsafeLogger.java:102)
    at com.alibaba.cobar.server.util.LogUtils.recordSql(LogUtils.java:115)
    at com.alibaba.cobar.server.ServerConnection.innerExecute(ServerConnection.java:874)
    - locked <0x00007f87382cb108> (a com.alibaba.cobar.server.ServerConnection)
    at com.alibaba.cobar.server.ServerConnection.innerExecute(ServerConnection.java:569)
    - locked <0x00007f87382cb108> (a com.alibaba.cobar.server.ServerConnection)
    at com.alibaba.cobar.server.ServerConnection$1.run(ServerConnection.java:402)
    at com.taobao.tddl.common.utils.thread.FlowControlThreadPool$RunnableAdapter.run(FlowControlThreadPool.java:480)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1152)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:627)
    at java.lang.Thread.run(Thread.java:861)
/**
* Creates a new buffered output stream to write data to the
* specified underlying output stream.
*
* @param out the underlying output stream.
*/
public BufferedOutputStream(OutputStream out) {
this(out, 8192);
}