2x E5-2650
128 GB RAM
12 x 4 TB 7200 RPM SATA drives connected to an HP H220 HBA
Dual-port 10 Gb NIC
The drives are configured as one large RAID-10 volume with mdadm; the filesystem is XFS. The OS is not installed on the drives - we PXE boot a CentOS image we've built with minimal packages installed, and do the OS configuration via Puppet. Originally, the hosts were running CentOS 6.5 with Kafka 0.8.1, without issue. We recently upgraded to CentOS 7.2 and Kafka 0.9, and that's when the trouble started.
What we're seeing is that when the weekly raid-check script executes, performance nosedives and I/O wait skyrockets. The RAID check starts out fairly fast (20000K/sec - the limit that's been set), but then quickly drops to about 4000K/sec. The dev.raid.speed_limit_* sysctls are at their defaults:
Code: Select all
dev.raid.speed_limit_max = 200000
dev.raid.speed_limit_min = 1000
Code: Select all
[root@r1k1log] # iostat 1 10
Linux 3.10.0-327.18.2.el7.x86_64 (r1k1) 05/24/16 _x86_64_ (32 CPU)
avg-cpu: %user %nice %system %iowait %steal %idle
8.80 0.06 1.89 14.79 0.00 74.46
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 52.59 2033.16 10682.78 1210398902 6359779847
sdb 52.46 2031.25 10682.78 1209265338 6359779847
sdc 52.40 2033.21 10683.53 1210433924 6360229587
sdd 52.22 2031.16 10683.53 1209212513 6360229587
sdf 52.20 2031.17 10682.06 1209216701 6359354331
sdg 52.62 2033.22 10684.17 1210437080 6360606756
sdh 52.57 2031.21 10684.17 1209242746 6360606756
sde 51.67 2033.17 10682.06 1210408935 6359354331
sdj 51.90 2031.13 10684.48 1209191501 6360795559
sdi 52.47 2033.16 10684.48 1210399262 6360795559
sdk 52.09 2033.15 10684.36 1210396915 6360724971
sdl 51.95 2031.20 10684.36 1209235241 6360724971
md127 138.20 74.49 64101.35 44348810 38161468777
avg-cpu: %user %nice %system %iowait %steal %idle
8.57 0.09 1.33 26.19 0.00 63.81
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 28.00 512.00 8416.00 512 8416
sdb 28.00 512.00 8416.00 512 8416
sdc 25.00 448.00 8876.00 448 8876
sdd 24.00 448.00 8364.00 448 8364
sdf 23.00 448.00 8192.00 448 8192
sdg 24.00 512.00 7680.00 512 7680
sdh 24.00 512.00 7680.00 512 7680
sde 23.00 448.00 8192.00 448 8192
sdj 23.00 512.00 7680.00 512 7680
sdi 23.00 512.00 7680.00 512 7680
sdk 23.00 512.00 7680.00 512 7680
sdl 23.00 512.00 7680.00 512 7680
md127 101.00 0.00 48012.00 0 48012
avg-cpu: %user %nice %system %iowait %steal %idle
6.50 0.00 1.04 24.27 0.00 68.19
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 26.00 512.00 9216.00 512 9216
sdb 26.00 512.00 9216.00 512 9216
sdc 27.00 576.00 9204.00 576 9204
sdd 28.00 576.00 9716.00 576 9716
sdf 31.00 768.00 9728.00 768 9728
sdg 28.00 512.00 10240.00 512 10240
sdh 28.00 512.00 10240.00 512 10240
sde 31.00 768.00 9728.00 768 9728
sdj 28.00 512.00 9744.00 512 9744
sdi 28.00 512.00 9744.00 512 9744
sdk 27.00 512.00 9728.00 512 9728
sdl 27.00 512.00 9728.00 512 9728
md127 114.00 0.00 57860.00 0 57860
avg-cpu: %user %nice %system %iowait %steal %idle
9.24 0.00 1.32 20.02 0.00 69.42
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 50.00 512.00 20408.00 512 20408
sdb 50.00 512.00 20408.00 512 20408
sdc 48.00 512.00 19984.00 512 19984
sdd 48.00 512.00 19984.00 512 19984
sdf 50.00 704.00 19968.00 704 19968
sdg 47.00 512.00 19968.00 512 19968
sdh 47.00 512.00 19968.00 512 19968
sde 50.00 704.00 19968.00 704 19968
sdj 48.00 512.00 19972.00 512 19972
sdi 48.00 512.00 19972.00 512 19972
sdk 48.00 512.00 19980.00 512 19980
sdl 48.00 512.00 19980.00 512 19980
md127 241.00 0.00 120280.00 0 120280
avg-cpu: %user %nice %system %iowait %steal %idle
7.98 0.00 0.98 18.42 0.00 72.63
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 39.00 640.00 14076.00 640 14076
sdb 39.00 640.00 14076.00 640 14076
sdc 36.00 512.00 14324.00 512 14324
sdd 36.00 512.00 14324.00 512 14324
sdf 36.00 576.00 13824.00 576 13824
sdg 43.00 1024.00 13824.00 1024 13824
sdh 43.00 1024.00 13824.00 1024 13824
sde 36.00 576.00 13824.00 576 13824
sdj 44.00 1024.00 14104.00 1024 14104
sdi 44.00 1024.00 14104.00 1024 14104
sdk 45.00 1024.00 14336.00 1024 14336
sdl 45.00 1024.00 14336.00 1024 14336
md127 168.00 0.00 84488.00 0 84488
avg-cpu: %user %nice %system %iowait %steal %idle
7.39 0.00 1.01 19.48 0.00 72.13
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 22.00 896.00 4096.00 896 4096
sdb 22.00 896.00 4096.00 896 4096
sdc 19.00 640.00 4344.00 640 4344
sdd 19.00 640.00 4344.00 640 4344
sdf 18.00 512.00 5120.00 512 5120
sdg 18.00 512.00 5120.00 512 5120
sdh 18.00 512.00 5120.00 512 5120
sde 18.00 512.00 5120.00 512 5120
sdj 18.00 512.00 4624.00 512 4624
sdi 18.00 512.00 4624.00 512 4624
sdk 18.00 512.00 4608.00 512 4608
sdl 18.00 512.00 4608.00 512 4608
md127 57.00 0.00 27912.00 0 27912
avg-cpu: %user %nice %system %iowait %steal %idle
10.92 0.00 1.58 21.84 0.00 65.66
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 23.00 576.00 7168.00 576 7168
sdb 23.00 576.00 7168.00 576 7168
sdc 29.00 896.00 7680.00 896 7680
sdd 29.00 896.00 7680.00 896 7680
sdf 31.00 1024.00 7680.00 1024 7680
sdg 31.00 1024.00 7680.00 1024 7680
sdh 31.00 1024.00 7680.00 1024 7680
sde 31.00 1024.00 7680.00 1024 7680
sdj 30.00 1024.00 7168.00 1024 7168
sdi 31.00 1024.00 7680.00 1024 7680
sdk 32.00 1024.00 7424.00 1024 7424
sdl 32.00 1024.00 7424.00 1024 7424
md127 89.00 0.00 44800.00 0 44800
avg-cpu: %user %nice %system %iowait %steal %idle
13.89 0.03 2.63 21.54 0.00 61.91
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 30.00 960.00 7680.00 960 7680
sdb 30.00 960.00 7680.00 960 7680
sdc 32.00 1024.00 7684.00 1024 7684
sdd 32.00 1024.00 7684.00 1024 7684
sdf 31.00 1024.00 7680.00 1024 7680
sdg 31.00 1024.00 7680.00 1024 7680
sdh 31.00 1024.00 7680.00 1024 7680
sde 31.00 1024.00 7680.00 1024 7680
sdj 32.00 1024.00 8192.00 1024 8192
sdi 31.00 1024.00 7680.00 1024 7680
sdk 26.00 704.00 7680.00 704 7680
sdl 26.00 704.00 7680.00 704 7680
md127 92.00 0.00 46596.00 0 46596
avg-cpu: %user %nice %system %iowait %steal %idle
14.24 0.00 2.22 19.89 0.00 63.65
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 33.00 1024.00 7244.00 1024 7244
sdb 33.00 1024.00 7244.00 1024 7244
sdc 31.00 1024.00 7668.00 1024 7668
sdd 31.00 1024.00 7668.00 1024 7668
sdf 31.00 1024.00 7680.00 1024 7680
sdg 26.00 768.00 6672.00 768 6672
sdh 26.00 768.00 6672.00 768 6672
sde 31.00 1024.00 7680.00 1024 7680
sdj 21.00 512.00 6656.00 512 6656
sdi 21.00 512.00 6656.00 512 6656
sdk 27.00 832.00 7168.00 832 7168
sdl 27.00 832.00 7168.00 832 7168
md127 88.00 0.00 43088.00 0 43088
avg-cpu: %user %nice %system %iowait %steal %idle
8.02 0.13 1.42 23.90 0.00 66.53
Device: tps kB_read/s kB_wrtn/s kB_read kB_wrtn
sda 30.00 1024.00 7168.00 1024 7168
sdb 30.00 1024.00 7168.00 1024 7168
sdc 29.00 960.00 7168.00 960 7168
sdd 29.00 960.00 7168.00 960 7168
sdf 23.00 512.00 7668.00 512 7668
sdg 28.00 768.00 7680.00 768 7680
sdh 28.00 768.00 7680.00 768 7680
sde 23.00 512.00 7668.00 512 7668
sdj 30.00 1024.00 6672.00 1024 6672
sdi 30.00 1024.00 6672.00 1024 6672
sdk 30.00 1024.00 7168.00 1024 7168
sdl 30.00 1024.00 7168.00 1024 7168
md127 87.00 0.00 43524.00 0 43524
Code: Select all
[root@r1k1] # cat /proc/mdstat
Personalities : [raid10]
md127 : active raid10 sdf[5] sdi[8] sdh[7] sdk[10] sdb[1] sdj[9] sdc[2] sdd[3] sdl[11] sde[13] sdg[12] sda[0]
23441323008 blocks super 1.2 512K chunks 2 near-copies [12/12] [UUUUUUUUUUUU]
[======>..............] check = 30.8% (7237496960/23441323008) finish=62944.5min speed=4290K/sec
unused devices: <none>
[root@r1k1] # mdadm --detail /dev/md127
/dev/md127:
Version : 1.2
Creation Time : Thu Sep 18 09:57:57 2014
Raid Level : raid10
Array Size : 23441323008 (22355.39 GiB 24003.91 GB)
Used Dev Size : 3906887168 (3725.90 GiB 4000.65 GB)
Raid Devices : 12
Total Devices : 12
Persistence : Superblock is persistent
Update Time : Tue May 24 15:32:56 2016
State : active, checking
Active Devices : 12
Working Devices : 12
Failed Devices : 0
Spare Devices : 0
Layout : near=2
Chunk Size : 512K
Check Status : 30% complete
Name : localhost:kafka
UUID : b6b98e3e:65ee06c3:3599d781:98908041
Events : 2459193
Number Major Minor RaidDevice State
0 8 0 0 active sync set-A /dev/sda
1 8 16 1 active sync set-B /dev/sdb
2 8 32 2 active sync set-A /dev/sdc
3 8 48 3 active sync set-B /dev/sdd
13 8 64 4 active sync set-A /dev/sde
5 8 80 5 active sync set-B /dev/sdf
12 8 96 6 active sync set-A /dev/sdg
7 8 112 7 active sync set-B /dev/sdh
8 8 128 8 active sync set-A /dev/sdi
9 8 144 9 active sync set-B /dev/sdj
10 8 160 10 active sync set-A /dev/sdk
11 8 176 11 active sync set-B /dev/sdl
We have 4 other Kafka clusters; however, they run on HP DL180 G6 servers. We completed the same CentOS 6.5 -> 7.2/Kafka 0.8 -> 0.9 upgrade on those clusters, and there has been no impact on their performance.
We've been banging our heads against the wall for a few weeks now, and we're really hoping someone from the community can point us in the right direction.