Dovecot (>= 2.3.15) is not properly replicating Expunge commands running in a containerised environment
A. Schulze
sca at andreasschulze.de
Thu Aug 5 14:03:35 EEST 2021
Hello *,
migrating a mailbox setup for several tens of thousands of customers to Docker containers running
on Debian VMs, we are currently observing Expunge commands not being replicated properly between two
Dovecot containers.
Delivering a new message into someones' inbox is replicated instantly via dsync (using TLS). However,
expunging a message from the very same inbox via
> doveadm expunge -u [users' email address] mailbox '*' header Message-ID [MID of the message in question]
is not replicated, since a
> doveadm fetch -u [users' email address] text header Message-ID [MID of the message in question]
still returns the mail on the replica. Sometimes, a
> doveadm replicator replicate [users' email address]
fixes this, but mostly, it does not. While doing all of this, these messages are logged by Dovecot:
> doveadm: Info: copy from INBOX: box=INBOX, uid=13, msgid=<[MID of the message in question]>, size=1793
> doveadm: Info: expunge: box=INBOX, uid=12, msgid=<[MID of the message in question]>, size=1793
> doveadm: Error: dsync(x): I/O has stalled, no activity for 600 seconds (last sent=handshake, last recv=handshake)
> doveadm: Error: Timeout during state=slave_recv_last_common
> doveadm: Error: dsync(x): I/O has stalled, no activity for 600 seconds (last sent=mailbox_delete, last recv=mailbox_delete)
> doveadm: Error: Timeout during state=slave_recv_mailbox
> doveadm: Error: dsync(SSL x): I/O has stalled, no activity for 600 seconds (version not received)
> doveadm: Error: Timeout during state=slave_recv_handshake
Unfortunately, these errors do not seem to be reproducible very well - the underlying behaviour however
is. Setting
> replication_sync_timeout = 2s
, as mentioned in https://doc.dovecot.org/settings/plugin/replication-plugin/ (the "s" seems to be bogus,
though, as Dovecot complains about an invalid timeframe given) results in these messages:
> doveadm(x): Error: doveadm client disconnected before handshake: Connection closed
> replicator: Panic: data stack: Out of memory when allocating 268435496 bytes
> replicator: Error: Raw backtrace: /usr/local/lib/dovecot/libdovecot.so.0(backtrace_append+0x3d) [0x7ff8f55343dd] -> /usr/local/lib/dovecot/libdovecot.so.0(backtrace_get+0x1e) [0x7ff8f55344fe] -> /usr/local/lib/dovecot/libdovecot.so.0(+0xfcc6b) [0x7ff8f5540c6b] -> /usr/local/lib/dovecot/libdovecot.so.0(+0xfcd01) [0x7ff8f5540d01] -> /usr/local/lib/dovecot/libdovecot.so.0(+0x533e7) [0x7ff8f54973e7] -> /usr/local/lib/dovecot/libdovecot.so.0(+0x52dcb) [0x7ff8f5496dcb] -> /usr/local/lib/dovecot/libdovecot.so.0(+0xf6625) [0x7ff8f553a625] -> /usr/local/lib/dovecot/libdovecot.so.0(+0x11c248) [0x7ff8f5560248] -> /usr/local/lib/dovecot/libdovecot.so.0(+0xf2655) [0x7ff8f5536655] -> /usr/local/lib/dovecot/libdovecot.so.0(buffer_write+0x67) [0x7ff8f5536867] -> dovecot/replicator(replicator_queue_push+0xe9) [0x5619b1b81a39] -> dovecot/replicator(+0x61f8) [0x5619b1b811f8] -> dovecot/replicator(+0x58ef) [0x5619b1b808ef] -> dovecot/replicator(+0x5ab2) [0x5619b1b80ab2] -> /usr/local/lib/dovecot/libdovecot.so.0(io_loop_call_io+0x69) [0x7ff8f5556b99] -> /usr/local/lib/dovecot/libdovecot.so.0(io_loop_handler_run_internal+0x131) [0x7ff8f5558101] -> /usr/local/lib/dovecot/libdovecot.so.0(io_loop_handler_run+0x4c) [0x7ff8f5556c3c] -> /usr/local/lib/dovecot/libdovecot.so.0(io_loop_run+0x40) [0x7ff8f5556db0] -> /usr/local/lib/dovecot/libdovecot.so.0(master_service_run+0x13) [0x7ff8f54cb7b3] -> dovecot/replicator(main+0x18f) [0x5619b1b7fc4f] -> /lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xeb) [0x7ff8f52a309b] -> dovecot/replicator(_start+0x2a) [0x5619b1b7fd0a]
> replicator: Fatal: master: service(replicator): child 175 killed with signal 6 (core not dumped - https://dovecot.org/bugreport.html#coredumps - set /proc/sys/fs/suid_dumpable to 2)
Not having observed this behaviour on physical or virtual Linux machines, we are pretty much out of ideas on
what to do. https://dovecot.org/pipermail/dovecot/2019-November/117467.html sounds like being a related issue.
Thank you in advance for any hints and replies.
root at x:/# doveconf -n
# 2.3.15 (0503334ab1): /usr/local/etc/dovecot/dovecot.conf
# Pigeonhole version 0.5.15 (e6a84e31)
# OS: Linux 4.9.0-16-amd64 x86_64 Debian 10.10
# Hostname: x
auth_cache_negative_ttl = 1 mins
auth_cache_size = 10 k
auth_master_user_separator = "#"
auth_mechanisms = plain login
auth_verbose_passwords = sha1
doveadm_password = # hidden, use -P to show it
doveadm_port = 12345
first_valid_uid = 8
imap_hibernate_timeout = 1 mins
imap_id_log = *
lmtp_save_to_detail_mailbox = yes
log_path = /dev/stderr
log_timestamp =
login_log_format_elements = user=<%u> method=%m rip=%r lip=%l mpid=%e %c session=<%{session}> %k
mail_access_groups = dovecot
mail_attribute_dict = file:%h/Maildir/dovecot-attributes
mail_gid = mail
mail_location = maildir:~/Maildir/
mail_plugins = acl quota mail_log notify replication
mail_uid = mail
managesieve_notify_capability = mailto
managesieve_sieve_capability = fileinto reject envelope encoded-character vacation subaddress comparator-i;ascii-numeric relational regex imap4flags copy include variables body enotify environment mailbox date index ihave duplicate mime foreverypart extracttext vacation-seconds
passdb {
args = /etc/dovecot/conf.d/ldap-searches/ldap.conf
driver = ldap
}
plugin {
acl = vfile
acl_defaults_from_inbox = yes
acl_shared_dict = file:/data/acl_shared_dict/acl_shared_dict
mail_replica = tcps:x
sieve_extensions = +vacation-seconds
sieve_vacation_min_period = 1m
}
pop3_uidl_format = %f
postmaster_address = x
protocols = imap lmtp pop3 sieve
replication_dsync_parameters = -d -N -l 60 -U
service aggregator {
fifo_listener replication-notify-fifo {
group = mail
mode = 0666
user = mail
}
unix_listener replication-notify {
group = mail
mode = 0666
user = mail
}
}
service auth-worker {
user = $default_internal_user
}
service auth {
unix_listener /socket/dovecot/x/sasl-server {
group = postfix
mode = 0660
user = postfix
}
}
service doveadm {
inet_listener {
port = 12345
ssl = yes
}
}
service imap-hibernate {
unix_listener imap-hibernate {
user = mail
}
}
service imap {
unix_listener imap-master {
group = dovecot
mode = 0660
}
}
service lmtp {
inet_listener {
port = 24
ssl = yes
}
unix_listener /socket/dovecot/x/lmtp-server {
mode = 0600
user = postfix
}
}
service replicator {
process_min_avail = 1
unix_listener replicator-doveadm {
group = mail
mode = 0600
user = mail
}
}
ssl_cert = </acme/live/x/cert+intermediate.pem
ssl_cipher_list = ECDHE+AESGCM:ECDHE+CHACHA20
ssl_client_ca_dir = /etc/ssl/certs/
ssl_client_cert = </acme/live/x/cert+intermediate.pem
ssl_client_key = # hidden, use -P to show it
ssl_dh = # hidden, use -P to show it
ssl_key = # hidden, use -P to show it
ssl_options = no_ticket
ssl_prefer_server_ciphers = yes
userdb {
driver = prefetch
}
userdb {
args = /etc/dovecot/conf.d/ldap-searches/ldap.conf
driver = ldap
}
userdb {
args = /etc/dovecot/conf.d/ldap-searches/ldap-catchall.conf
driver = ldap
}
verbose_proctitle = yes
protocol imap {
imap_metadata = yes
mail_plugins = acl quota mail_log notify replication fts fts_squat imap_acl imap_quota mail_log notify
}
protocol lmtp {
mail_plugins = acl quota mail_log notify replication sieve notify mail_log
ssl = required
ssl_ca = </etc/ssl/certs/ca-certificates.crt
ssl_min_protocol = TLSv1.3
ssl_verify_client_cert = yes
}
More information about the dovecot
mailing list