作者简介
谭峰
网名francs,PostgreSQL中文社区委员,《PostgreSQL实战》作者之一,《PostgreSQL 9 Administration Cookbook》译者之一,致力于PostgreSQL技术分享,博客 https://postgres.fun,三墩IT人。
故障现象
数据库版本: PostgreSQL 11.2
数据库大小: 大于1TB
操作系统: CentOS 7.4
硬件环境: HPE DL560 gen10 + 全闪存(3par 8440)
其它信息: 单实例,配置了逻辑复制
排查过程
archive_mode = on # enables archiving; off, on, or always
# (change requires restart)
archive_command = 'cp --backup %p /log/archive_log/%f' # command to use to archive a logfile segment
wal_keep_segments = 0 # in logfile segments; 0 disables
checkpoint_timeout = 5min # range 30s-1d
模拟WAL目录膨胀
环境规划
环境准备
mydb=> CREATE TABLE user1 (userid int4, username character varying(32),regtime timestamp without time zone);
CREATE TABLE
mydb=> INSERT INTO user1 (userid,username,regtime) SELECT n, 'user' || n, now() FROM generate_series(1,10000000) n;
INSERT 10000000
mydb=> ALTER TABLE user1 ADD PRIMARY KEY(userid);
ALTER TABLE
mydb=> show wal_keep_segments ;
wal_keep_segments
-------------------
4
(1 row)
[pg13@ydtf01 pg_wal]$ ls $PGDATA/pg_wal | wc -l
57
mydb=> CREATE TABLE user1 (userid int4, username character varying(32),regtime timestamp without time zone);
CREATE TABLE
mydb=> ALTER TABLE user1 ADD PRIMARY KEY(userid);
ALTER TABLE
mydb=> CREATE PUBLICATION pub_user1 FOR TABLE user1 ;
CREATE PUBLICATION
CREATE SUBSCRIPTION sub_user1 CONNECTION 'host=192.168.2.11 port=1922 dbname=mydb user=repuser' PUBLICATION pub_user1;
源库压力测试
postgres=# select *from pg_replication_slots WHERE slot_name='sub_user1';
-[ RECORD 1 ]-------+-----------
slot_name | sub_user1
plugin | pgoutput
slot_type | logical
datoid | 16386
database | mydb
temporary | f
active | t
active_pid | 84420
xmin |
catalog_xmin | 549020
restart_lsn | 1/17E06270
confirmed_flush_lsn | 1/17E062A8
wal_status | normal
min_safe_lsn |
mydb=# ALTER SUBSCRIPTION sub_user1 DISABLE;
ALTER SUBSCRIPTION
postgres=# select *from pg_replication_slots WHERE slot_name='sub_user1';-[ RECORD 1 ]-------+-----------slot_name | sub_user1plugin | pgoutputslot_type | logicaldatoid | 16386database | mydbtemporary | factive | factive_pid |xmin |catalog_xmin | 549020restart_lsn | 1/17E06270confirmed_flush_lsn | 1/17E062A8wal_status | normalmin_safe_lsn |
\set v_id random(1,10000000)
UPDATE user1 SET username='updated'||:v_id WHERE userid=:v_id
[pg13@ydtf01 pg_wal]$ ls $PGDATA/pg_wal | wc -l
57
pgbench -n -Mprepared -c 4 -j 2 -T 120 -U pguser mydb -f tran1.sql > tran1.out 2>&1 &
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
57
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
57
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
69
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
69
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
73
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
80
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
81
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
86
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
88
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
91
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
91
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
95
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
96
[pg13@ydtf01 ~]$ ls $PGDATA/pg_wal| wc -l
应对措施