loss of masters over ipsec hoses bind
Matt LaPlante
cyberdog3k at gmail.com
Mon Dec 24 03:25:58 UTC 2007
On Dec 23, 2007 8:56 PM, Mark Andrews <Mark_Andrews at isc.org> wrote:
>
>
> >
> > > I'm currently running Bind 9.4.1 (Ubuntu Gutsy). I have several zones
> > > in master->slave setups, which normally works just fine. The other
> > > day, however, I ran into an odd problem. A couple of the slave zones
> > > generally update over an ipsec connected network. The ipsec
> > > connection went away, and shortly thereafter bind royally wedged
> > > itself, refusing to serve any data (including basic forward lookups)
> > > and was not even responding to rndc restarts. It took me a good while
> > > of restarting the system and poking around logs to decide to strace
> > > the process, which eventually lead me to removing the ipsec-dependant
> > > slave zones from the config. As soon as I did this, Bind became
> > > stable again. Interestingly, zones which updated over public IP space
> > > behaved fine, even if the master server was unreachable. It was only
> > > zones that were trying to go over the down ipsec connection that hosed
> > > the daemon.
> > >
> > > This whole issue is logged in a bit more detail here, including output
> > > from strace:
> > > https://bugs.launchpad.net/ubuntu/+source/bind/+bug/177489
> > >
> > > I can (apparently) reproduce this issue again with little difficulty,
> > > so I'd be glad to help debug it.
> > >
> > > -
> > > Matt LaPlante
> >
> > I would say that some I/O is blocking when it shouldn't
> > with sockets which use ipsec. If this is the case it is
> > a kernel bug and named can't do anything to prevent it.
> > Named marks all sockets as non-blocking.
> >
> > Mark
>
> We should be able to prove / disprove the theory above with
> this patch. It sets a 2 second alarm for all socket I/O.
> You need to build named with threaded disabled and run with
> "named -g <rest of named's arguements>".
>
> You will get "ALARM: message" printed to stderr.
>
> Mark
>
> Index: lib/isc/unix/socket.c
> ===================================================================
> RCS file: /proj/cvs/prod/bind9/lib/isc/unix/socket.c,v
> retrieving revision 1.275
> diff -u -r1.275 socket.c
> --- lib/isc/unix/socket.c 14 Dec 2007 03:52:40 -0000 1.275
> +++ lib/isc/unix/socket.c 24 Dec 2007 01:27:38 -0000
> @@ -281,6 +281,18 @@
>
> #define SOCK_DEAD(s) ((s)->references == 0)
>
> +#ifndef ISC_PLATFORM_USETHREADS
> +static const char *alarm_msg = "ALARM: none\n";
> +static void alarm_handler(int foo) {
> +
> + UNUSED(foo);
> + /*
> + * Write message to stderr.
> + */
> + write(2, alarm_msg, strlen(alarm_msg));
> +}
> +#endif
> +
> static void
> manager_log(isc_socketmgr_t *sockmgr,
> isc_logcategory_t *category, isc_logmodule_t *module, int level,
> @@ -985,8 +997,16 @@
> dump_msg(&msghdr);
> #endif
>
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(2);
> + alarm_msg = "ALARM: recvmsg\n";
> +#endif
> cc = recvmsg(sock->fd, &msghdr, 0);
> recv_errno = errno;
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(0);
> + alarm_msg = "ALARM: none\n";
> +#endif
>
> #if defined(ISC_SOCKET_DEBUG)
> dump_msg(&msghdr);
> @@ -1142,8 +1162,16 @@
> build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
>
> resend:
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(2);
> + alarm_msg = "ALARM: sendmsg\n";
> +#endif
> cc = sendmsg(sock->fd, &msghdr, 0);
> send_errno = errno;
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(0);
> + alarm_msg = "ALARM: none\n";
> +#endif
>
> /*
> * Check for error or block condition.
> @@ -2623,6 +2651,7 @@
> REQUIRE(managerp != NULL && *managerp == NULL);
>
> #ifndef ISC_PLATFORM_USETHREADS
> + signal(SIGALRM, alarm_handler);
> if (socketmgr != NULL) {
> socketmgr->refs++;
> *managerp = socketmgr;
> @@ -3281,6 +3310,10 @@
> goto cleanup;
> }
>
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(2);
> + alarm_msg = "ALARM: connect unix\n";
> +#endif
> if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
> sizeof(sockaddr->type.sunix)) < 0) {
> switch (errno) {
> @@ -3306,6 +3339,10 @@
> break;
> }
> }
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(0);
> + alarm_msg = "ALARM: none\n";
> +#endif
> cleanup:
> close(s);
> #else
> @@ -3573,7 +3610,7 @@
> isc_socket_connev_t *dev;
> isc_task_t *ntask = NULL;
> isc_socketmgr_t *manager;
> - int cc;
> + int cc, connect_errno;
> char strbuf[ISC_STRERRORSIZE];
>
> REQUIRE(VALID_SOCKET(sock));
> @@ -3607,12 +3644,21 @@
> * outstanding, and it might happen to complete.
> */
> sock->peer_address = *addr;
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(2);
> + alarm_msg = "ALARM: connect\n";
> +#endif
> cc = connect(sock->fd, &addr->type.sa, addr->length);
> + connect_errno = errno;
> +#ifndef ISC_PLATFORM_USETHREADS
> + alarm(0);
> + alarm_msg = "ALARM: none\n";
> +#endif
> if (cc < 0) {
> - if (SOFT_ERROR(errno) || errno == EINPROGRESS)
> + if (SOFT_ERROR(connect_errno) || connect_errno == EINPROGRESS)
> goto queue;
>
> - switch (errno) {
> + switch (connect_errno) {
> #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
> ERROR_MATCH(EACCES, ISC_R_NOPERM);
> ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
> @@ -3632,8 +3678,9 @@
>
> sock->connected = 0;
>
> - isc__strerror(errno, strbuf, sizeof(strbuf));
> - UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
> + isc__strerror(connect_errno, strbuf, sizeof(strbuf));
> + UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", connect_errno,
> + strbuf);
>
> UNLOCK(&sock->lock);
> isc_event_free(ISC_EVENT_PTR(&dev));
Thanks for taking the time to help me investigate this. :)
I applied the patch and built w/out threading, however I get this
error when trying to run the new binary:
named: pthread_mutex_lock.c:82: __pthread_mutex_lock: Assertion
`mutex->__data.__owner == 0' failed.
Aborted (core dumped)
Running with -g did allow me to see a new error (using the orignal
binary) as follows:
23-Dec-2007 21:37:52.844 socket.c:1173: unexpected error:
23-Dec-2007 21:37:52.848 internal_send: 192.168.1.1#53: Interrupted
system call should be restarted
23-Dec-2007 21:37:52.852 errno2result.c:111: unexpected error:
23-Dec-2007 21:37:52.855 unable to convert errno to isc_result: 85:
Interrupted system call should be restarted
23-Dec-2007 21:38:03.644 socket.c:1173: unexpected error:
23-Dec-2007 21:38:03.644 internal_send: 192.168.1.1#53: Interrupted
system call should be restarted
23-Dec-2007 21:38:03.644 errno2result.c:111: unexpected error:
23-Dec-2007 21:38:03.644 unable to convert errno to isc_result: 85:
Interrupted system call should be restarted
23-Dec-2007 21:38:03.650 zone 1.168.192.in-addr.arpa/IN: refresh:
failure trying master 192.168.1.1#53 (source 0.0.0.0#0): operation
canceled
>
> --
> Mark Andrews, ISC
> 1 Seymour St., Dundas Valley, NSW 2117, Australia
> PHONE: +61 2 9871 4742 INTERNET: Mark_Andrews at isc.org
>
More information about the bind-users
mailing list