drbd+heartbeat+mysql slave 方案

测试机器IP:public
10.2.0.181 — drbd+heartbeat
10.2.0.182 — drbd+heartbeat
10.2.0.183 — mysql slave
10.2.0.184 — mysql slave

private:

192.168.0.181 –db-181
192.168.0.182 –db-182

[root@db-181 ~]# cat /etc/hosts
# Do not remove the following line, or various programs
# that require network functionality will fail.
127.0.0.1 localhost.localdomain localhost
::1 localhost6.localdomain6 localhost6
10.2.0.181 db-181
10.2.0.182 db-182

[root@db-manager ~]# uname -a
Linux db-manager 2.6.18-194.el5 #1 SMP Mon Mar 29 22:10:29 EDT 2010 x86_64 x86_64 x86_64 GNU/Linux
[root@db-manager ~]# cat /etc/issue
Enterprise Linux Enterprise Linux Server release 5.5 (Carthage)
Kernel \r on an \m

[root@db-manager ~]#

安装如下RPM包
-rw-r--r-- 1 root root 139462 Mar 26 15:00 kmod-drbd83-xen-8.3.12-1.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 139033 Mar 26 15:00 kmod-drbd83-8.3.12-1.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 241248 Mar 26 15:00 drbd83-8.3.12-2.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 18198017 Mar 26 22:44 MySQL-client-5.5.16-1.rhel5.x86_64.rpm
-rw-r--r-- 1 root root 3821401 Mar 26 22:45 MySQL-devel-5.5.16-1.rhel5.x86_64.rpm
-rw-r--r-- 1 root root 54806458 Mar 26 22:46 MySQL-server-5.5.16-1.rhel5.x86_64.rpm
-rw-r--r-- 1 root root 267496 Mar 27 11:01 heartbeat-devel-2.1.3-3.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 204099 Mar 27 11:01 heartbeat-ldirectord-2.1.3-3.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 225376 Mar 27 11:02 heartbeat-pils-2.1.3-3.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 1873215 Mar 27 11:02 heartbeat-2.1.3-3.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 356671 Mar 27 11:02 heartbeat-stonith-2.1.3-3.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 307426 Mar 27 11:02 heartbeat-gui-2.1.3-3.el5.centos.x86_64.rpm
-rw-r--r-- 1 root root 243886 Mar 27 11:04 libnet-1.1.2.1-2.rf.x86_64.rpm

下载地址
http://www.drbd.org/download/packages/
http://mirror.centos.org/centos/5/extras/x86_64/RPMS/

配置如下文件 db-181 db-182:

[root@db-181 drbd.d]# cat /etc/drbd.conf
#
# please have a a look at the example configuration file in
# /usr/share/doc/drbd83/drbd.conf
#
include "/etc/drbd.d/global_common.conf";
include "/etc/drbd.d/*.res";

[root@db-181 drbd.d]# cat /etc/drbd.d/global_common.conf
global {
usage-count yes;
# minor-count dialog-refresh disable-ip-verification
}

common {
protocol C;
handlers {
pri-on-incon-degr "/usr/lib/drbd/notify-pri-on-incon-degr.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
pri-lost-after-sb "/usr/lib/drbd/notify-pri-lost-after-sb.sh; /usr/lib/drbd/notify-emergency-reboot.sh; echo b > /proc/sysrq-trigger ; reboot -f";
local-io-error "/usr/lib/drbd/notify-io-error.sh; /usr/lib/drbd/notify-emergency-shutdown.sh; echo o > /proc/sysrq-trigger ; halt -f";
# fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
# split-brain "/usr/lib/drbd/notify-split-brain.sh root";
# out-of-sync "/usr/lib/drbd/notify-out-of-sync.sh root";
# before-resync-target "/usr/lib/drbd/snapshot-resync-target-lvm.sh -p 15 -- -c 16k";
# after-resync-target /usr/lib/drbd/unsnapshot-resync-target-lvm.sh;
}

startup {
# wfc-timeout degr-wfc-timeout outdated-wfc-timeout wait-after-sb
}

disk {
# on-io-error fencing use-bmbv no-disk-barrier no-disk-flushes
# no-disk-drain no-md-flushes max-bio-bvecs
}

net {
# sndbuf-size rcvbuf-size timeout connect-int ping-int ping-timeout max-buffers
# max-epoch-size ko-count allow-two-primaries cram-hmac-alg shared-secret
# after-sb-0pri after-sb-1pri after-sb-2pri data-integrity-alg no-tcp-cork
}

syncer { rate 10M; # 指定同步速率
}
}

[root@db-181 drbd.d]# cat /etc/drbd.d/r0.res
resource r0 {
on db-181 {
device /dev/drbd0;
disk /dev/sda6;
address 192.168.0.181:7898;
meta-disk internal;
}
on db-182 {
device /dev/drbd0;
disk /dev/sda6;
address 192.168.0.182:7898;
meta-disk internal;
}
}

db-182 上的配置与上面 db-181 的配置相同

[root@db-181 drbd.d]# fdisk -l

Disk /dev/sda: 299.4 GB, 299439751168 bytes
255 heads, 63 sectors/track, 36404 cylinders
Units = cylinders of 16065 * 512 = 8225280 bytes

Device Boot Start End Blocks Id System
/dev/sda1 * 1 25 200781 83 Linux
/dev/sda2 26 6399 51199155 83 Linux
/dev/sda3 6400 12773 51199155 83 Linux
/dev/sda4 12774 36404 189816007+ 5 Extended
/dev/sda5 12774 16852 32764536 82 Linux swap / Solaris
/dev/sda6 16853 36404 157051408+ 83 Linux

[root@db-181 drbd.d]# ls -l /dev/sda6
brw-r—– 1 root disk 8, 6 Mar 27 18:50 /dev/sda6

在 db-181 db-182 上执行:

mkfs.ext3 /dev/sda6
dd if=/dev/zero of=/dev/sda6 bs=1M count=1;

drbdadm create-md r0   # 首次初始化时需要先创建 DRBD 元数据(meta-disk internal),否则 drbd 无法启动
/etc/init.d/drbd start

在主节点上执行如下命令 并进行格式化:

drbdadm -- --overwrite-data-of-peer primary all

查看状态

watch -n 1 cat /proc/drbd

Every 2.0s: cat /proc/drbd Mon Mar 26 17:49:40 2012

version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:SyncSource ro:Primary/Secondary ds:UpToDate/Inconsistent C r—–
ns:6249344 nr:0 dw:2601216 dr:3648325 al:1248 bm:225 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:150637196
[>....................] sync'ed: 3.9% (147104/153056)M
finish: 3:58:01 speed: 10,536 (15,268) K/sec

开始同步大概3个小时之后 同步完成 继续查看状态:

[root@db-181 drbd.d]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r—–
ns:888 nr:1260 dw:2148 dr:5541 al:11 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 drbd.d]#

UpToDate/UpToDate 说明同步已经完成

在主节点挂载/data目录 注意 只能在主节点挂载文件系统:

mkfs.ext3 /dev/drbd0
mount -t ext3 /dev/drbd0 /data

[root@db-181 ~]# df -k
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda3 49594228 2111424 44922848 5% /
/dev/sda2 49594228 184276 46849996 1% /home
/dev/sda1 194442 12183 172220 7% /boot
tmpfs 4082228 0 4082228 0% /dev/shm
/dev/drbd0 154580772 222688 146505756 1% /data

下面安装mysql

mysql> select version();
+------------+
| version()  |
+------------+
| 5.5.16-log |
+------------+
1 row in set (0.00 sec)

修改datadir 为/data/mysql

181 /data 上的文件会自动复制到182上

下面安装heartbeat

NOTE: 以下3个配置文件中的内容在两个机器上均需要保持一致。

[root@db-181 drbd.d]# cat /etc/ha.d/ha.cf
debugfile /var/log/ha-debug –debug位置
logfile /var/log/ha-log –log 位置
autojoin none
logfacility local0 用于syslog()/logger的设备
keepalive 2 心跳频率,自己设定。1:表示1秒;200ms:表示200毫秒
deadtime 30 节点死亡时间阈值
warntime 10 发出警告时间
initdead 120
udpport 694
ucast eth1 192.168.0.182 采用网卡eth1的udp单播来通知心跳,后面是对端节点eth1的IP
auto_failback off — 关闭主节点漂移
node db-181 — 主机名 uname -n
node db-182

ping 10.0.0.58 — 设置一个第三方可靠ip
respawn hacluster /usr/lib64/heartbeat/ipfail 该选项是可选配置,列出与
heartbeat一起启动和关闭的进程,该进程一般是和heartbeat集成的插件,这些进程
遇到故障可以自动重新启动。最常用的进程是ipfail,此进程用于检测和处理网络故障,
需要配合ping语句指定的ping node来检测网络的连通性。其中hacluster表示启动ipfail进程的身份
apiauth ipfail gid=haclient uid=hacluster 这里的hacluster用户以及haclient组,是安装完heartbeat后被自动创建的
deadping 5

[root@db-181 drbd.d]# cat /etc/ha.d/authkeys
auth 3
#1 crc
#2 sha1 HI!
3 md5 Hello!
[root@db-181 drbd.d]#

Note: chmod 600 /etc/ha.d/authkeys,否则,启动heartbeat服务时会报error

[root@db-181 drbd.d]# cat /etc/ha.d/haresources
db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0

该配置文件必须在所有的节点上都保持一致。最开头的节点名称没有必要非得是当前节点的主机名,只需保证我们选择的主节点(preferred node)上的服务都自动开启就够了,避免开启所有节点上的服务
heartbeat 需要负责 mysql 资源的启动和关闭,所以需要在/etc/ha.d/resource.d这个目录下创建一个控制该资源的启动脚本,命令为: ln -s /etc/init.d/mysqld /etc/ha.d/resource.d/mysql 。最后需要指定一个VIP 绑定在eth0接口上

现在,heartbeat也已经配置完毕,分别在两台机器上启动heartbeat服务,执行命令:service heartbeat start

现在db-182为主节点:

[root@db-182 ~]# ifconfig -a
eth0 Link encap:Ethernet HWaddr D4:AE:52:B9:6C:B7
inet addr:10.2.0.182 Bcast:10.2.0.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:749654 errors:0 dropped:0 overruns:0 frame:0
TX packets:536770 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:205883569 (196.3 MiB) TX bytes:531576493 (506.9 MiB)
Interrupt:169 Memory:c0000000-c0012800

eth0:0 Link encap:Ethernet HWaddr D4:AE:52:B9:6C:B7
inet addr:10.2.0.250 Bcast:10.2.0.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
Interrupt:169 Memory:c0000000-c0012800

[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r—–
ns:9240732 nr:888 dw:9240704 dr:12374 al:266 bm:6 lo:0 pe:326 ua:0 ap:325 ep:1 wo:b oos:0

———————————————————————————————————

测试1 -切换测试:service heartbeat standby

db-182 日志如下:


heartbeat[4351]: 2012/03/28_15:53:30 info: db-182 wants to go standby [all]
heartbeat[4351]: 2012/03/28_15:53:30 info: standby: db-181 can take our all resources
heartbeat[21773]: 2012/03/28_15:53:30 info: give up all HA resources (standby).
ResourceManager[21786]: 2012/03/28_15:53:30 info: Releasing resource group: db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
ResourceManager[21786]: 2012/03/28_15:53:30 info: Running /etc/ha.d/resource.d/IPaddr 10.2.0.250/24/eth0 stop
IPaddr[21853]: 2012/03/28_15:53:30 INFO: ifconfig eth0:0 down
IPaddr[21824]: 2012/03/28_15:53:30 INFO: Success
ResourceManager[21786]: 2012/03/28_15:53:30 info: Running /etc/init.d/mysql stop
ResourceManager[21786]: 2012/03/28_15:53:34 info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 stop
Filesystem[21959]: 2012/03/28_15:53:35 INFO: Running stop for /dev/drbd0 on /data
Filesystem[21959]: 2012/03/28_15:53:35 INFO: Trying to unmount /data
Filesystem[21959]: 2012/03/28_15:53:35 ERROR: Couldn’t unmount /data; trying cleanup with SIGTERM
Filesystem[21959]: 2012/03/28_15:53:35 INFO: Some processes on /data were signalled
Filesystem[21959]: 2012/03/28_15:53:36 ERROR: Couldn’t unmount /data; trying cleanup with SIGTERM
Filesystem[21959]: 2012/03/28_15:53:36 INFO: Some processes on /data were signalled
Filesystem[21959]: 2012/03/28_15:53:37 ERROR: Couldn’t unmount /data; trying cleanup with SIGTERM
Filesystem[21959]: 2012/03/28_15:53:37 INFO: Some processes on /data were signalled
Filesystem[21959]: 2012/03/28_15:53:38 ERROR: Couldn’t unmount /data; trying cleanup with SIGKILL
Filesystem[21959]: 2012/03/28_15:53:38 INFO: Some processes on /data were signalled
Filesystem[21959]: 2012/03/28_15:53:39 INFO: unmounted /data successfully
Filesystem[21948]: 2012/03/28_15:53:39 INFO: Success
ResourceManager[21786]: 2012/03/28_15:53:39 info: Running /etc/ha.d/resource.d/drbddisk r0 stop
heartbeat[21773]: 2012/03/28_15:53:39 info: all HA resource release completed (standby).
heartbeat[4351]: 2012/03/28_15:53:39 info: Local standby process completed [all].
heartbeat[4351]: 2012/03/28_15:53:42 WARN: 1 lost packet(s) for [db-181] [35995:35997]
heartbeat[4351]: 2012/03/28_15:53:42 info: remote resource transition completed.
heartbeat[4351]: 2012/03/28_15:53:42 info: No pkts missing from db-181!
heartbeat[4351]: 2012/03/28_15:53:42 info: Other node completed standby takeover of all resources.

db-181 日志如下:


heartbeat[3709]: 2012/03/28_15:52:21 info: db-182 wants to go standby [all]
heartbeat[3709]: 2012/03/28_15:52:30 info: standby: acquire [all] resources from db-182
heartbeat[9042]: 2012/03/28_15:52:30 info: acquire all HA resources (standby).
ResourceManager[9055]: 2012/03/28_15:52:30 info: Acquiring resource group: db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
ResourceManager[9055]: 2012/03/28_15:52:30 info: Running /etc/ha.d/resource.d/drbddisk r0 start
Filesystem[9126]: 2012/03/28_15:52:30 INFO: Resource is stopped
ResourceManager[9055]: 2012/03/28_15:52:30 info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 start
Filesystem[9207]: 2012/03/28_15:52:30 INFO: Running start for /dev/drbd0 on /data
Filesystem[9196]: 2012/03/28_15:52:30 INFO: Success
ResourceManager[9055]: 2012/03/28_15:52:30 info: Running /etc/init.d/mysql start
IPaddr[9599]: 2012/03/28_15:52:33 INFO: Resource is stopped
ResourceManager[9055]: 2012/03/28_15:52:33 info: Running /etc/ha.d/resource.d/IPaddr 10.2.0.250/24/eth0 start
IPaddr[9697]: 2012/03/28_15:52:33 INFO: Using calculated netmask for 10.2.0.250: 255.255.255.0
IPaddr[9697]: 2012/03/28_15:52:33 INFO: eval ifconfig eth0:0 10.2.0.250 netmask 255.255.255.0 broadcast 10.2.0.255
IPaddr[9668]: 2012/03/28_15:52:33 INFO: Success
heartbeat[9042]: 2012/03/28_15:52:33 info: all HA resource acquisition completed (standby).
heartbeat[3709]: 2012/03/28_15:52:33 info: Standby resource acquisition done [all].
heartbeat[3709]: 2012/03/28_15:52:33 info: remote resource transition completed.

VIP 已经漂移到了10.2.0.181上:

[root@db-181 drbd.d]# ifconfig -a
eth0 Link encap:Ethernet HWaddr D4:AE:52:B9:6D:D0
inet addr:10.2.0.181 Bcast:10.2.0.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:381188 errors:0 dropped:0 overruns:0 frame:0
TX packets:39784 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:48349461 (46.1 MiB) TX bytes:5729098 (5.4 MiB)
Interrupt:169 Memory:c0000000-c0012800

eth0:0 Link encap:Ethernet HWaddr D4:AE:52:B9:6D:D0
inet addr:10.2.0.250 Bcast:10.2.0.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
Interrupt:169 Memory:c0000000-c0012800

[root@db-181 drbd.d]# ps -ef |grep mysql
root 9308 1 0 15:52 ? 00:00:00 /bin/sh /usr/bin/mysqld_safe --datadir=/data/mysql --pid-file=/data/mysql/db-181.pid
mysql 9559 9308 0 15:52 ? 00:00:00 /usr/sbin/mysqld --basedir=/usr --datadir=/data/mysql --plugin-dir=/usr/lib64/mysql/plugin --user=mysql --log-error=/var/log/mysqld.log --pid-file=/data/mysql/db-181.pid --socket=/data/mysql/mysql.sock --port=3306
root 9812 8756 0 15:54 pts/2 00:00:00 grep mysql

[root@db-181 drbd.d]#
[root@db-181 drbd.d]# df -k
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda3 49594228 1804700 45229572 4% /
/dev/sda2 49594228 184276 46849996 1% /home
/dev/sda1 194442 12183 172220 7% /boot
tmpfs 4082228 0 4082228 0% /dev/shm
/dev/drbd0 154580772 708864 146019580 1% /data
[root@db-181 drbd.d]#

[root@db-181 drbd.d]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r—–
ns:1148 nr:9327912 dw:9329060 dr:27602 al:12 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0

———————————————————————————————————

测试二 : reboot standby 节点

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r—–
ns:656 nr:940 dw:6264 dr:134990 al:15 bm:25 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]# reboot

[root@db-182 ~]# tail -f /var/log/ha-log

heartbeat[1704]: 2012/03/29_10:17:35 info: Link db-181:eth1 dead.
ipfail[1730]: 2012/03/29_10:17:35 info: Link Status update: Link db-181/eth1 now has status dead
ipfail[1730]: 2012/03/29_10:17:36 info: Asking other side for ping node count.
ipfail[1730]: 2012/03/29_10:17:36 info: Checking remote count of ping nodes.
heartbeat[1704]: 2012/03/29_10:17:59 WARN: node db-181: is dead
heartbeat[1704]: 2012/03/29_10:17:59 info: Dead node db-181 gave up resources.
ipfail[1730]: 2012/03/29_10:17:59 info: Status update: Node db-181 now has status dead
ipfail[1730]: 2012/03/29_10:18:01 info: NS: We are still alive!

判断181死亡 182 为primary 节点

[root@db-181 ~]# /etc/init.d/drbd start
Starting DRBD resources: [ d(r0) s(r0) n(r0) ].
[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r—–
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]#

181重启完毕之后 保持standby 角色 直接开启 heartbeat

[root@db-181 ~]# tail -f /var/log/ha-log

heartbeat[3395]: 2012/03/29_10:23:10 info: Status update for node db-182: status active
harc[3404]: 2012/03/29_10:23:10 info: Running /etc/ha.d/rc.d/status status
heartbeat[3395]: 2012/03/29_10:23:10 info: Comm_now_up(): updating status to active
heartbeat[3395]: 2012/03/29_10:23:10 info: Local status now set to: ‘active’
heartbeat[3395]: 2012/03/29_10:23:10 info: Starting child client “/usr/lib64/heartbeat/ipfail” (498,496)
heartbeat[3421]: 2012/03/29_10:23:10 info: Starting “/usr/lib64/heartbeat/ipfail” as uid 498 gid 496 (pid 3421)
heartbeat[3395]: 2012/03/29_10:23:11 info: remote resource transition completed.
heartbeat[3395]: 2012/03/29_10:23:11 info: remote resource transition completed.
heartbeat[3395]: 2012/03/29_10:23:11 info: Local Resource acquisition completed. (none)
heartbeat[3395]: 2012/03/29_10:23:11 info: Initial resource acquisition complete (T_RESOURCES(them))
ipfail[3421]: 2012/03/29_10:23:17 info: Ping node count is balanced.

没有出现任何问题

———————————————————————————————————

测试三: reboot primary 节点

reboot 182

181 日志如下:


heartbeat[3395]: 2012/03/29_10:24:49 info: Link db-182:eth1 dead.
ipfail[3421]: 2012/03/29_10:24:49 info: Link Status update: Link db-182/eth1 now has status dead
ipfail[3421]: 2012/03/29_10:24:51 info: Asking other side for ping node count.
ipfail[3421]: 2012/03/29_10:24:51 info: Checking remote count of ping nodes.
heartbeat[3395]: 2012/03/29_10:25:14 WARN: node db-182: is dead
heartbeat[3395]: 2012/03/29_10:25:14 WARN: No STONITH device configured.
heartbeat[3395]: 2012/03/29_10:25:14 WARN: Shared disks are not protected.
heartbeat[3395]: 2012/03/29_10:25:14 info: Resources being acquired from db-182.
ipfail[3421]: 2012/03/29_10:25:14 info: Status update: Node db-182 now has status dead
harc[3429]: 2012/03/29_10:25:14 info: Running /etc/ha.d/rc.d/status status
heartbeat[3430]: 2012/03/29_10:25:14 info: No local resources [/usr/share/heartbeat/ResourceManager listkeys db-181] to acquire.
mach_down[3458]: 2012/03/29_10:25:14 info: Taking over resource group drbddisk::r0
ResourceManager[3484]: 2012/03/29_10:25:14 info: Acquiring resource group: db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
ResourceManager[3484]: 2012/03/29_10:25:14 info: Running /etc/ha.d/resource.d/drbddisk r0 start
Filesystem[3555]: 2012/03/29_10:25:15 INFO: Resource is stopped
ResourceManager[3484]: 2012/03/29_10:25:15 info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 start
Filesystem[3636]: 2012/03/29_10:25:15 INFO: Running start for /dev/drbd0 on /data
Filesystem[3625]: 2012/03/29_10:25:15 INFO: Success
ResourceManager[3484]: 2012/03/29_10:25:15 info: Running /etc/init.d/mysql start
ipfail[3421]: 2012/03/29_10:25:15 info: NS: We are still alive!
IPaddr[4028]: 2012/03/29_10:25:17 INFO: Resource is stopped
ResourceManager[3484]: 2012/03/29_10:25:17 info: Running /etc/ha.d/resource.d/IPaddr 10.2.0.250/24/eth0 start
IPaddr[4126]: 2012/03/29_10:25:17 INFO: Using calculated netmask for 10.2.0.250: 255.255.255.0
IPaddr[4126]: 2012/03/29_10:25:17 INFO: eval ifconfig eth0:0 10.2.0.250 netmask 255.255.255.0 broadcast 10.2.0.255
IPaddr[4097]: 2012/03/29_10:25:17 INFO: Success
mach_down[3458]: 2012/03/29_10:25:17 info: /usr/share/heartbeat/mach_down: nice_failback: foreign resources acquired
mach_down[3458]: 2012/03/29_10:25:17 info: mach_down takeover complete for node db-182.
heartbeat[3395]: 2012/03/29_10:25:17 info: mach_down takeover complete.

[root@db-181 ~]# df -k
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda3 49594228 1803584 45230688 4% /
/dev/sda2 49594228 184276 46849996 1% /home
/dev/sda1 194442 12183 172220 7% /boot
tmpfs 4082228 0 4082228 0% /dev/shm
/dev/drbd0 154580772 708924 146019520 1% /data
[root@db-181 ~]# ps -ef |grep mysql
root 3737 1 0 10:25 ? 00:00:00 /bin/sh /usr/bin/mysqld_safe --datadir=/data/mysql --pid-file=/data/mysql/db-181.pid
mysql 3988 3737 0 10:25 ? 00:00:00 /usr/sbin/mysqld --basedir=/usr --datadir=/data/mysql --plugin-dir=/usr/lib64/mysql/plugin --user=mysql --log-error=/var/log/mysqld.log --pid-file=/data/mysql/db-181.pid --socket=/data/mysql/mysql.sock --port=3306
root 4240 3078 0 10:25 pts/1 00:00:00 grep mysql
[root@db-181 ~]#

[root@db-181 ~]# ifconfig -a
eth0 Link encap:Ethernet HWaddr D4:AE:52:B9:6D:D0
inet addr:10.2.0.181 Bcast:10.2.0.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
RX packets:2307 errors:0 dropped:0 overruns:0 frame:0
TX packets:464 errors:0 dropped:0 overruns:0 carrier:0
collisions:0 txqueuelen:1000
RX bytes:281396 (274.8 KiB) TX bytes:62148 (60.6 KiB)
Interrupt:169 Memory:c0000000-c0012800

eth0:0 Link encap:Ethernet HWaddr D4:AE:52:B9:6D:D0
inet addr:10.2.0.250 Bcast:10.2.0.255 Mask:255.255.255.0
UP BROADCAST RUNNING MULTICAST MTU:1500 Metric:1
Interrupt:169 Memory:c0000000-c0012800

所有资源切换到181 上

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:WFConnection ro:Primary/Unknown ds:UpToDate/DUnknown C r—–
ns:0 nr:116 dw:432 dr:22465 al:10 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:260
[root@db-181 ~]#

角色变为 primary

[root@db-182 ~]# /etc/init.d/drbd start
Starting DRBD resources: [ d(r0) s(r0) n(r0) ].
[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:StandAlone ro:Secondary/Unknown ds:UpToDate/DUnknown r—–
ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:32

出现Unknown 状态 查看182 日志


Mar 29 10:29:45 db-182 kernel: block drbd0: uuid_compare()=100 by rule 90
Mar 29 10:29:45 db-182 kernel: block drbd0: helper command: /sbin/drbdadm initial-split-brain minor-0
Mar 29 10:29:45 db-182 kernel: block drbd0: helper command: /sbin/drbdadm initial-split-brain minor-0 exit code 0 (0x0)
Mar 29 10:29:45 db-182 kernel: block drbd0: Split-Brain detected but unresolved, dropping connection!
Mar 29 10:29:45 db-182 kernel: block drbd0: helper command: /sbin/drbdadm split-brain minor-0
Mar 29 10:29:45 db-182 kernel: block drbd0: meta connection shut down by peer.
Mar 29 10:29:45 db-182 kernel: block drbd0: conn( WFReportParams -> NetworkFailure )
Mar 29 10:29:45 db-182 kernel: block drbd0: asender terminated
Mar 29 10:29:45 db-182 kernel: block drbd0: Terminating asender thread
Mar 29 10:29:45 db-182 kernel: block drbd0: helper command: /sbin/drbdadm split-brain minor-0 exit code 0 (0x0)
Mar 29 10:29:45 db-182 kernel: block drbd0: conn( NetworkFailure -> Disconnecting )
Mar 29 10:29:45 db-182 kernel: block drbd0: error receiving ReportState, l: 4!
Mar 29 10:29:45 db-182 kernel: block drbd0: Connection closed
Mar 29 10:29:45 db-182 kernel: block drbd0: conn( Disconnecting -> StandAlone )
Mar 29 10:29:45 db-182 kernel: block drbd0: receiver terminated
Mar 29 10:29:45 db-182 kernel: block drbd0: Terminating receiver thread


出现了脑裂 ,需要人工干预

[root@db-182 ~]# drbdadm -- --discard-my-data connect r0
[root@db-181 ~]# drbdadm connect all

[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r—–
ns:0 nr:260 dw:260 dr:0 al:0 bm:4 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r—–
ns:0 nr:260 dw:260 dr:0 al:0 bm:4 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-182 ~]#

[root@db-182 ~]# service heartbeat start
Starting High-Availability services:
[ OK ]
[root@db-182 ~]# df -k
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda3 49594228 2015660 45018612 5% /
/dev/sda2 49594228 184276 46849996 1% /home
/dev/sda1 194442 12183 172220 7% /boot
tmpfs 4082228 0 4082228 0% /dev/shm

查看181 日志

heartbeat[3395]: 2012/03/29_11:03:25 info: Heartbeat restart on node db-182
heartbeat[3395]: 2012/03/29_11:03:25 info: Link db-182:eth1 up.
heartbeat[3395]: 2012/03/29_11:03:25 info: Status update for node db-182: status init
heartbeat[3395]: 2012/03/29_11:03:25 info: Status update for node db-182: status up
ipfail[3421]: 2012/03/29_11:03:25 info: Link Status update: Link db-182/eth1 now has status up
ipfail[3421]: 2012/03/29_11:03:25 info: Status update: Node db-182 now has status init
ipfail[3421]: 2012/03/29_11:03:25 info: Status update: Node db-182 now has status up
harc[4345]: 2012/03/29_11:03:25 info: Running /etc/ha.d/rc.d/status status
harc[4361]: 2012/03/29_11:03:25 info: Running /etc/ha.d/rc.d/status status
heartbeat[3395]: 2012/03/29_11:03:26 info: Status update for node db-182: status active
ipfail[3421]: 2012/03/29_11:03:26 info: Status update: Node db-182 now has status active
harc[4377]: 2012/03/29_11:03:26 info: Running /etc/ha.d/rc.d/status status
heartbeat[3395]: 2012/03/29_11:03:26 info: remote resource transition completed.
ipfail[3421]: 2012/03/29_11:03:27 info: Asking other side for ping node count.
ipfail[3421]: 2012/03/29_11:03:33 info: No giveup timer to abort.


181 继续接管资源

———————————————————————————————————

测试四 断开心跳线 primary 节点:

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r—–
ns:260 nr:0 dw:432 dr:22725 al:10 bm:5 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0

[root@db-181 ~]# ifdown eth1

db-182 日志 :


Mar 29 11:32:47 db-182 heartbeat: [3517]: info: Link db-181:eth1 dead.
Mar 29 11:32:47 db-182 ipfail: [3543]: info: Link Status update: Link db-181/eth1 now has status dead
Mar 29 11:32:49 db-182 ipfail: [3543]: info: Asking other side for ping node count.
Mar 29 11:32:49 db-182 ipfail: [3543]: info: Checking remote count of ping nodes.
Mar 29 11:32:52 db-182 kernel: block drbd0: PingAck did not arrive in time.
Mar 29 11:32:52 db-182 kernel: block drbd0: peer( Primary -> Unknown ) conn( Connected -> NetworkFailure ) pdsk( UpToDate -> DUnknown )
Mar 29 11:32:52 db-182 kernel: block drbd0: asender terminated
Mar 29 11:32:52 db-182 kernel: block drbd0: Terminating asender thread
Mar 29 11:32:52 db-182 kernel: block drbd0: Connection closed
Mar 29 11:32:52 db-182 kernel: block drbd0: conn( NetworkFailure -> Unconnected )
Mar 29 11:32:52 db-182 kernel: block drbd0: receiver terminated
Mar 29 11:32:52 db-182 kernel: block drbd0: Restarting receiver thread
Mar 29 11:32:52 db-182 kernel: block drbd0: receiver (re)started
Mar 29 11:32:52 db-182 kernel: block drbd0: conn( Unconnected -> WFConnection )
Mar 29 11:33:12 db-182 heartbeat: [3517]: WARN: node db-181: is dead
Mar 29 11:33:12 db-182 heartbeat: [3517]: WARN: No STONITH device configured.
Mar 29 11:33:12 db-182 heartbeat: [3517]: WARN: Shared disks are not protected.
Mar 29 11:33:12 db-182 heartbeat: [3517]: info: Resources being acquired from db-181.
Mar 29 11:33:12 db-182 ipfail: [3543]: info: Status update: Node db-181 now has status dead
Mar 29 11:33:12 db-182 harc[3606]: info: Running /etc/ha.d/rc.d/status status
Mar 29 11:33:12 db-182 heartbeat: [3607]: info: Local Resource acquisition completed.
Mar 29 11:33:12 db-182 mach_down[3636]: info: /usr/share/heartbeat/mach_down: nice_failback: foreign resources acquired
Mar 29 11:33:12 db-182 mach_down[3636]: info: mach_down takeover complete for node db-181.
Mar 29 11:33:12 db-182 heartbeat: [3517]: info: mach_down takeover complete.
Mar 29 11:33:12 db-182 harc[3699]: info: Running /etc/ha.d/rc.d/ip-request-resp ip-request-resp
Mar 29 11:33:13 db-182 ip-request-resp[3699]: received ip-request-resp drbddisk::r0 OK yes
Mar 29 11:33:13 db-182 ResourceManager[3720]: info: Acquiring resource group: db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
Mar 29 11:33:13 db-182 ResourceManager[3720]: info: Running /etc/ha.d/resource.d/drbddisk r0 start
Mar 29 11:33:13 db-182 kernel: block drbd0: role( Secondary -> Primary )
Mar 29 11:33:13 db-182 kernel: block drbd0: new current UUID DFA1CB9AD5B93F59:EC913DF5B2397852:A13E0488362E386E:A13D0488362E386E
Mar 29 11:33:13 db-182 Filesystem[3791]: INFO: Resource is stopped
Mar 29 11:33:13 db-182 ResourceManager[3720]: info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 start
Mar 29 11:33:13 db-182 Filesystem[3872]: INFO: Running start for /dev/drbd0 on /data
Mar 29 11:33:13 db-182 ipfail: [3543]: info: NS: We are still alive!
Mar 29 11:33:13 db-182 kernel: kjournald starting. Commit interval 5 seconds
Mar 29 11:33:13 db-182 kernel: EXT3-fs warning: maximal mount count reached, running e2fsck is recommended
Mar 29 11:33:13 db-182 kernel: EXT3 FS on drbd0, internal journal
Mar 29 11:33:13 db-182 kernel: EXT3-fs: recovery complete.
Mar 29 11:33:13 db-182 kernel: EXT3-fs: mounted filesystem with ordered data mode.
Mar 29 11:33:13 db-182 Filesystem[3861]: INFO: Success

db-181 日志:

heartbeat[3399]: 2012/03/29_11:31:35 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:35 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:37 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:37 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:39 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:39 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3395]: 2012/03/29_11:31:40 info: Link db-182:eth1 dead.
ipfail[3421]: 2012/03/29_11:31:40 info: Link Status update: Link db-182/eth1 now has status dead
heartbeat[3399]: 2012/03/29_11:31:41 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:41 ERROR: write_child: write failure on ucast eth1.: No such device
ipfail[3421]: 2012/03/29_11:31:41 info: Asking other side for ping node count.
ipfail[3421]: 2012/03/29_11:31:41 info: Checking remote count of ping nodes.
heartbeat[3399]: 2012/03/29_11:31:42 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:42 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:43 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:43 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:45 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:45 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:45 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:45 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:47 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:47 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:49 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:49 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:51 ERROR: glib: Unable to send [-1] ucast packet: No such device
heartbeat[3399]: 2012/03/29_11:31:51 ERROR: write_child: write failure on ucast eth1.: No such device
heartbeat[3399]: 2012/03/29_11:31:51 WARN: Temporarily Suppressing write error messages
heartbeat[3399]: 2012/03/29_11:31:51 WARN: Is a cable unplugged on ucast eth1?
heartbeat[3395]: 2012/03/29_11:32:05 WARN: node db-182: is dead
heartbeat[3395]: 2012/03/29_11:32:05 info: Dead node db-182 gave up resources.
ipfail[3421]: 2012/03/29_11:32:05 info: Status update: Node db-182 now has status dead
ipfail[3421]: 2012/03/29_11:32:07 info: NS: We are still alive!

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:StandAlone ro:Primary/Unknown ds:UpToDate/DUnknown r-----
ns:260 nr:0 dw:432 dr:22725 al:10 bm:5 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]#

[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:WFConnection ro:Primary/Unknown ds:UpToDate/DUnknown C r-----
ns:0 nr:260 dw:580 dr:22313 al:8 bm:4 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:240
[root@db-182 ~]#

出现了脑裂:此时 181 和 182 同时 mount 了 /data 目录,数据可能会受到影响。

第三方机器连接测试:

[root@localhost ~]# ssh 10.2.0.250
The authenticity of host '10.2.0.250 (10.2.0.250)' can't be established.
RSA key fingerprint is 96:d1:c8:08:64:a0:fb:5a:45:44:48:be:42:e0:df:1d.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added '10.2.0.250' (RSA) to the list of known hosts.
root@10.2.0.250's password:
Last login: Thu Mar 29 11:35:15 2012 from 192.168.16.87
[root@db-182 ~]#

连接到了 182,说明 VIP 漂移到了 182,此时数据都写入了 182 的 disk,需要人工恢复这次故障:

[root@db-181 ~]# ifup eth1

181 日志 :


heartbeat[3395]: 2012/03/29_11:42:32 CRIT: Cluster node db-182 returning after partition.
heartbeat[3395]: 2012/03/29_11:42:32 info: For information on cluster partitions, See URL: http://linux-ha.org/SplitBrain
heartbeat[3395]: 2012/03/29_11:42:32 WARN: Deadtime value may be too small.
heartbeat[3395]: 2012/03/29_11:42:32 info: See FAQ for information on tuning deadtime.
heartbeat[3395]: 2012/03/29_11:42:32 info: URL: http://linux-ha.org/FAQ#heavy_load
heartbeat[3395]: 2012/03/29_11:42:32 info: Link db-182:eth1 up.
heartbeat[3395]: 2012/03/29_11:42:32 WARN: Late heartbeat: Node db-182: interval 657640 ms
heartbeat[3395]: 2012/03/29_11:42:32 info: Status update for node db-182: status active
ipfail[3421]: 2012/03/29_11:42:32 info: Link Status update: Link db-182/eth1 now has status up
ipfail[3421]: 2012/03/29_11:42:32 info: Status update: Node db-182 now has status active
harc[4605]: 2012/03/29_11:42:32 info: Running /etc/ha.d/rc.d/status status
heartbeat[3395]: 2012/03/29_11:42:34 info: Heartbeat shutdown in progress. (3395)
heartbeat[4622]: 2012/03/29_11:42:34 info: Giving up all HA resources.
ResourceManager[4635]: 2012/03/29_11:42:34 info: Releasing resource group: db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
ResourceManager[4635]: 2012/03/29_11:42:34 info: Running /etc/ha.d/resource.d/IPaddr 10.2.0.250/24/eth0 stop
IPaddr[4702]: 2012/03/29_11:42:34 INFO: ifconfig eth0:0 down
IPaddr[4673]: 2012/03/29_11:42:34 INFO: Success
ResourceManager[4635]: 2012/03/29_11:42:35 info: Running /etc/init.d/mysql stop
ResourceManager[4635]: 2012/03/29_11:42:38 info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 stop
Filesystem[4806]: 2012/03/29_11:42:38 INFO: Running stop for /dev/drbd0 on /data
Filesystem[4806]: 2012/03/29_11:42:38 INFO: Trying to unmount /data
Filesystem[4806]: 2012/03/29_11:42:38 INFO: unmounted /data successfully
Filesystem[4795]: 2012/03/29_11:42:38 INFO: Success
ResourceManager[4635]: 2012/03/29_11:42:38 info: Running /etc/ha.d/resource.d/drbddisk r0 stop
heartbeat[4622]: 2012/03/29_11:42:38 info: All HA resources relinquished.
heartbeat[3395]: 2012/03/29_11:42:39 info: killing /usr/lib64/heartbeat/ipfail process group 3421 with signal 15
heartbeat[3395]: 2012/03/29_11:42:39 info: Received shutdown notice from ‘db-182’.
heartbeat[3395]: 2012/03/29_11:42:39 info: Resource takeover cancelled – shutdown in progress.
heartbeat[3395]: 2012/03/29_11:42:41 info: killing HBWRITE process 3399 with signal 15
heartbeat[3395]: 2012/03/29_11:42:41 info: killing HBREAD process 3400 with signal 15
heartbeat[3395]: 2012/03/29_11:42:41 info: killing HBWRITE process 3401 with signal 15
heartbeat[3395]: 2012/03/29_11:42:41 info: killing HBREAD process 3402 with signal 15
heartbeat[3395]: 2012/03/29_11:42:41 info: killing HBFIFO process 3398 with signal 15
heartbeat[3395]: 2012/03/29_11:42:41 info: Core process 3402 exited. 5 remaining
heartbeat[3395]: 2012/03/29_11:42:41 info: Core process 3401 exited. 4 remaining
heartbeat[3395]: 2012/03/29_11:42:41 info: Core process 3400 exited. 3 remaining
heartbeat[3395]: 2012/03/29_11:42:41 info: Core process 3399 exited. 2 remaining
heartbeat[3395]: 2012/03/29_11:42:41 info: Core process 3398 exited. 1 remaining
heartbeat[3395]: 2012/03/29_11:42:41 info: db-181 Heartbeat shutdown complete.
heartbeat[3395]: 2012/03/29_11:42:41 info: Heartbeat restart triggered.
heartbeat[3395]: 2012/03/29_11:42:41 info: Restarting heartbeat.
heartbeat[3395]: 2012/03/29_11:42:41 info: Performing heartbeat restart exec.
heartbeat[3395]: 2012/03/29_11:43:12 info: Version 2 support: false
heartbeat[3395]: 2012/03/29_11:43:12 WARN: Logging daemon is disabled –enabling logging daemon is recommended
heartbeat[3395]: 2012/03/29_11:43:12 info: **************************
heartbeat[3395]: 2012/03/29_11:43:12 info: Configuration validated. Starting heartbeat 2.1.3
heartbeat[4905]: 2012/03/29_11:43:12 info: heartbeat: version 2.1.3
heartbeat[4905]: 2012/03/29_11:43:12 info: Heartbeat generation: 1332819174
heartbeat[4905]: 2012/03/29_11:43:12 info: glib: ucast: write socket priority set to IPTOS_LOWDELAY on eth1
heartbeat[4905]: 2012/03/29_11:43:12 info: glib: ucast: bound send socket to device: eth1
heartbeat[4905]: 2012/03/29_11:43:12 info: glib: ucast: bound receive socket to device: eth1
heartbeat[4905]: 2012/03/29_11:43:12 info: glib: ucast: started on port 694 interface eth1 to 192.168.0.182
heartbeat[4905]: 2012/03/29_11:43:12 info: glib: ping heartbeat started.
heartbeat[4905]: 2012/03/29_11:43:12 info: G_main_add_TriggerHandler: Added signal manual handler
heartbeat[4905]: 2012/03/29_11:43:12 info: G_main_add_TriggerHandler: Added signal manual handler
heartbeat[4905]: 2012/03/29_11:43:12 info: G_main_add_SignalHandler: Added signal handler for signal 17
heartbeat[4905]: 2012/03/29_11:43:12 info: Local status now set to: ‘up’
heartbeat[4905]: 2012/03/29_11:43:13 info: Link 10.0.0.58:10.0.0.58 up.
heartbeat[4905]: 2012/03/29_11:43:13 info: Status update for node 10.0.0.58: status ping
heartbeat[4905]: 2012/03/29_11:43:15 info: Link db-182:eth1 up.
heartbeat[4905]: 2012/03/29_11:43:15 info: Status update for node db-182: status up
harc[4913]: 2012/03/29_11:43:15 info: Running /etc/ha.d/rc.d/status status
heartbeat[4905]: 2012/03/29_11:43:15 info: Comm_now_up(): updating status to active
heartbeat[4905]: 2012/03/29_11:43:15 info: Local status now set to: ‘active’
heartbeat[4905]: 2012/03/29_11:43:15 info: Starting child client “/usr/lib64/heartbeat/ipfail” (498,496)
heartbeat[4905]: 2012/03/29_11:43:15 info: Status update for node db-182: status active
heartbeat[4930]: 2012/03/29_11:43:15 info: Starting “/usr/lib64/heartbeat/ipfail” as uid 498 gid 496 (pid 4930)
harc[4931]: 2012/03/29_11:43:15 info: Running /etc/ha.d/rc.d/status status
heartbeat[4905]: 2012/03/29_11:43:25 info: local resource transition completed.
heartbeat[4905]: 2012/03/29_11:43:25 info: Initial resource acquisition complete (T_RESOURCES(us))
ipfail[4930]: 2012/03/29_11:43:25 info: Ping node count is balanced.
heartbeat[4949]: 2012/03/29_11:43:25 info: No local resources [/usr/share/heartbeat/ResourceManager listkeys db-181] to acquire.
heartbeat[4905]: 2012/03/29_11:43:26 info: remote resource transition completed.


182 日志:

Mar 29 11:44:24 db-182 harc[4864]: info: Running /etc/ha.d/rc.d/status status
Mar 29 11:44:30 db-182 ipfail: [4861]: info: Status update: Node db-181 now has status active
Mar 29 11:44:32 db-182 ipfail: [4861]: info: Asking other side for ping node count.
Mar 29 11:44:34 db-182 heartbeat: [4836]: info: remote resource transition completed.
Mar 29 11:44:34 db-182 heartbeat: [4836]: info: remote resource transition completed.
Mar 29 11:44:34 db-182 heartbeat: [4836]: info: Initial resource acquisition complete (T_RESOURCES(us))
Mar 29 11:44:34 db-182 ipfail: [4861]: info: No giveup timer to abort.
Mar 29 11:44:34 db-182 heartbeat: [4880]: info: Local Resource acquisition completed.
Mar 29 11:44:34 db-182 harc[4923]: info: Running /etc/ha.d/rc.d/ip-request-resp ip-request-resp
Mar 29 11:44:34 db-182 ip-request-resp[4923]: received ip-request-resp drbddisk::r0 OK yes
Mar 29 11:44:34 db-182 ResourceManager[4944]: info: Acquiring resource group: db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
Mar 29 11:44:34 db-182 ResourceManager[4944]: info: Running /etc/ha.d/resource.d/drbddisk r0 start
Mar 29 11:44:34 db-182 kernel: block drbd0: role( Secondary -> Primary )
Mar 29 11:44:34 db-182 Filesystem[5015]: INFO: Resource is stopped
Mar 29 11:44:34 db-182 ResourceManager[4944]: info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 start
Mar 29 11:44:34 db-182 Filesystem[5096]: INFO: Running start for /dev/drbd0 on /data
Mar 29 11:44:35 db-182 kernel: kjournald starting. Commit interval 5 seconds
Mar 29 11:44:35 db-182 kernel: EXT3-fs warning: maximal mount count reached, running e2fsck is recommended
Mar 29 11:44:35 db-182 kernel: EXT3 FS on drbd0, internal journal
Mar 29 11:44:35 db-182 kernel: EXT3-fs: mounted filesystem with ordered data mode.
Mar 29 11:44:35 db-182 Filesystem[5085]: INFO: Success
Mar 29 11:44:35 db-182 ResourceManager[4944]: info: Running /etc/init.d/mysql start
Mar 29 11:44:37 db-182 IPaddr[5488]: INFO: Resource is stopped
Mar 29 11:44:37 db-182 ResourceManager[4944]: info: Running /etc/ha.d/resource.d/IPaddr 10.2.0.250/24/eth0 start
Mar 29 11:44:37 db-182 IPaddr[5586]: INFO: Using calculated netmask for 10.2.0.250: 255.255.255.0
Mar 29 11:44:37 db-182 IPaddr[5586]: INFO: eval ifconfig eth0:0 10.2.0.250 netmask 255.255.255.0 broadcast 10.2.0.255
Mar 29 11:44:37 db-182 avahi-daemon[3239]: Registering new address record for 10.2.0.250 on eth0.
Mar 29 11:44:37 db-182 avahi-daemon[3239]: Withdrawing address record for 10.2.0.250 on eth0.
Mar 29 11:44:37 db-182 avahi-daemon[3239]: Registering new address record for 10.2.0.250 on eth0.
Mar 29 11:44:37 db-182 IPaddr[5557]: INFO: Success

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:StandAlone ro:Secondary/Unknown ds:UpToDate/DUnknown r-----
ns:260 nr:0 dw:592 dr:22725 al:10 bm:5 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:148
[root@db-181 ~]#

[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:WFConnection ro:Primary/Unknown ds:UpToDate/DUnknown C r-----
ns:0 nr:260 dw:976 dr:44366 al:10 bm:4 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:360
[root@db-182 ~]#

出现脑裂 需要指定 182 为primary

[root@db-181 ~]# drbdadm -- --discard-my-data connect r0
[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r-----
ns:0 nr:380 dw:972 dr:22725 al:10 bm:8 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]#

[root@db-182 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r-----
ns:380 nr:260 dw:976 dr:44746 al:10 bm:8 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-182 ~]#

这种情况 出现的问题需要人工干预

测试五 断开心跳线 primary 节点 (另外一种情况)

[root@db-182 ~]# cat /etc/ha.d/haresources
db-182 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0

目前 haresources 中指定的主节点为 db-182,现在的 primary 也是 182。如果在 182 上 ifdown eth1,那么 VIP 会漂移到 db-181 上,此时去连接也会连接到 db-181 上;如果这时候 drbd 没有同步完成,会丢失部分数据。
另外,db-182 的 eth1 恢复后,primary 会重新被 db-182 接管,可能会导致灾难,因为故障时间里的数据都写在了 db-181 的盘上。
此时需要修改 haresources,设置主节点为 db-181,这样 ifup eth1 之后 db-181 仍然为 primary node。

———————————————————————————————————

测试六: 断开 standby 节点心跳:

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r-----
ns:396 nr:384 dw:3392 dr:112101 al:14 bm:19 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0

[root@db-182 ~]# ifdown eth1

[root@db-182 ~]# df -k
Filesystem 1K-blocks Used Available Use% Mounted on
/dev/sda3 49594228 2017708 45016564 5% /
/dev/sda2 49594228 184276 46849996 1% /home
/dev/sda1 194442 12183 172220 7% /boot
tmpfs 4082228 0 4082228 0% /dev/shm
/dev/drbd0 154580772 708952 146019492 1% /data
[root@db-182 ~]#

同样出现了脑裂 182认为181死掉了 自己挂载了/data

[root@localhost ~]# ssh 10.2.0.250
The authenticity of host '10.2.0.250 (10.2.0.250)' can't be established.
RSA key fingerprint is 96:d1:c8:08:64:a0:fb:5a:45:44:48:be:42:e0:df:1d.
Are you sure you want to continue connecting (yes/no)? yes
Warning: Permanently added '10.2.0.250' (RSA) to the list of known hosts.
root@10.2.0.250's password:
Last login: Thu Mar 29 12:37:30 2012 from 10.2.0.183
[root@db-182 ~]#

此时连接到了182上

后面的情况跟primary 心跳断开一样 需要人为确定primary端 这个时候就存在了风险

总结下心跳线出现问题 可能导致的问题:

1、如果心跳网络和ping server是同一个网络,那么如果心跳网络有问题,会有什么现象?如何解决?
现象:双方都收不到心跳信号,认为对方已死,所以导致脑裂
解决:
方法1: 双心跳网络。

方法2:添加硬件的stonith(fence)。
原理:从一开始,就是由主机器发送心跳给备机器;心跳网络出问题时,备机器认为主机器死了,然后备机器会发指令让 stonith 设备重启或者关闭主机器。方向是单向的,只能由备机器关闭主机器。

2、如果ping server是处于生产网络上的,而心跳网络现在出问题。又会出现什么现象?如何解决?
跟上面的结果是一样。解决方法也是一样。在这种情况,ping server不起作用。
结论:整个架构还是以心跳信号作为故障判断主要依据。
3、ping server处于生产网络,而心跳网络没有问题,但生产网络出现故障。会出现什么现象?

ping server 到底是一个什么角色?

作用: 辅助判断是否是出现网络故障。用到的插件ipfail.

如果节点node1与ping server失去联系(生产网络故障),该节点node1就会通过心跳网络问另外一个节点node2是否与ping server通信正常。如果另外一个节点node2能够通信正常,那么该节点node1就知道自己出现网络故障,就会把资源主动让给另一个节点node2

断开eth0 待测 ….

——-
补充 断开eth0 测试:
[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r-----
ns:1340 nr:420 dw:7932 dr:204162 al:17 bm:30 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]#

目前 181 为主节点,断开 181 的 public ip

heartbeat[1334]: 2012/04/06_14:50:32 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:32 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:32 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:34 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:34 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:34 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
heartbeat[1329]: 2012/04/06_14:50:35 WARN: node 10.0.0.58: is dead
ipfail[1354]: 2012/04/06_14:50:35 info: Status update: Node 10.0.0.58 now has status dead
heartbeat[1329]: 2012/04/06_14:50:35 info: Link 10.0.0.58:10.0.0.58 dead.
harc[6836]: 2012/04/06_14:50:35 info: Running /etc/ha.d/rc.d/status status
ipfail[1354]: 2012/04/06_14:50:36 info: NS: We are dead. :<
ipfail[1354]: 2012/04/06_14:50:36 info: Link Status update: Link 10.0.0.58/10.0.0.58 now has status dead
heartbeat[1334]: 2012/04/06_14:50:36 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:36 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:36 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
ipfail[1354]: 2012/04/06_14:50:37 info: We are dead. :<
ipfail[1354]: 2012/04/06_14:50:37 info: Asking other side for ping node count.
heartbeat[1334]: 2012/04/06_14:50:38 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:38 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:38 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
ipfail[1354]: 2012/04/06_14:50:40 info: Giving up because we were told that we have less ping nodes.
ipfail[1354]: 2012/04/06_14:50:40 info: Delayed giveup in 4 seconds.
heartbeat[1334]: 2012/04/06_14:50:40 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:40 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:40 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:42 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:42 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:42 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
ipfail[1354]: 2012/04/06_14:50:44 info: giveup() called (timeout worked)
heartbeat[1334]: 2012/04/06_14:50:44 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1329]: 2012/04/06_14:50:44 info: db-181 wants to go standby [all]
heartbeat[1334]: 2012/04/06_14:50:44 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:44 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
heartbeat[1329]: 2012/04/06_14:50:45 info: standby: db-182 can take our all resources
heartbeat[6861]: 2012/04/06_14:50:45 info: give up all HA resources (standby).
ResourceManager[6874]: 2012/04/06_14:50:45 info: Releasing resource group: db-181 drbddisk::r0 Filesystem::/dev/drbd0::/data::ext3 mysql IPaddr::10.2.0.250/24/eth0
ResourceManager[6874]: 2012/04/06_14:50:45 info: Running /etc/ha.d/resource.d/IPaddr 10.2.0.250/24/eth0 stop
IPaddr[6912]: 2012/04/06_14:50:45 INFO: Success
ResourceManager[6874]: 2012/04/06_14:50:45 info: Running /etc/init.d/mysql stop
ResourceManager[6874]: 2012/04/06_14:50:46 info: Running /etc/ha.d/resource.d/Filesystem /dev/drbd0 /data ext3 stop
Filesystem[7033]: 2012/04/06_14:50:46 INFO: Running stop for /dev/drbd0 on /data
Filesystem[7033]: 2012/04/06_14:50:46 INFO: Trying to unmount /data
Filesystem[7033]: 2012/04/06_14:50:46 INFO: unmounted /data successfully
Filesystem[7022]: 2012/04/06_14:50:46 INFO: Success
ResourceManager[6874]: 2012/04/06_14:50:46 info: Running /etc/ha.d/resource.d/drbddisk r0 stop
heartbeat[6861]: 2012/04/06_14:50:46 info: all HA resource release completed (standby).
heartbeat[1329]: 2012/04/06_14:50:46 info: Local standby process completed [all].
heartbeat[1334]: 2012/04/06_14:50:46 ERROR: glib: Error sending packet: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:46 info: glib: euid=0 egid=0
heartbeat[1334]: 2012/04/06_14:50:46 ERROR: write_child: write failure on ping 10.0.0.58.: Network is unreachable
heartbeat[1334]: 2012/04/06_14:50:48 ERROR: glib: Error sending packet: Network is unreachable

181 判断自己dead 并且释放资源

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r-----
ns:1500 nr:668 dw:8340 dr:204162 al:17 bm:30 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]#

DRBD 角色变为 Secondary 现在恢复eth0

heartbeat[1329]: 2012/04/06_14:57:03 info: Link 10.0.0.58:10.0.0.58 up.
heartbeat[1329]: 2012/04/06_14:57:03 WARN: Late heartbeat: Node 10.0.0.58: interval 392100 ms
heartbeat[1329]: 2012/04/06_14:57:03 info: Status update for node 10.0.0.58: status ping
ipfail[1354]: 2012/04/06_14:57:03 info: Link Status update: Link 10.0.0.58/10.0.0.58 now has status up
ipfail[1354]: 2012/04/06_14:57:03 info: Status update: Node 10.0.0.58 now has status ping
ipfail[1354]: 2012/04/06_14:57:03 info: A ping node just came up.
ipfail[1354]: 2012/04/06_14:57:04 info: Asking other side for ping node count.
ipfail[1354]: 2012/04/06_14:57:07 info: No giveup timer to abort.

[root@db-181 ~]# cat /proc/drbd
version: 8.3.12 (api:88/proto:86-96)
GIT-hash: e2a8ef4656be026bbae540305fcb998a5991090f build by mockbuild@builder10.centos.org, 2012-01-28 13:52:25
0: cs:Connected ro:Secondary/Primary ds:UpToDate/UpToDate C r-----
ns:1500 nr:668 dw:8340 dr:204162 al:17 bm:30 lo:0 pe:0 ua:0 ap:0 ep:1 wo:b oos:0
[root@db-181 ~]#

依然保持Secondary角色