loss of masters over ipsec hoses bind

Matt LaPlante cyberdog3k at gmail.com
Mon Dec 24 03:25:58 UTC 2007


On Dec 23, 2007 8:56 PM, Mark Andrews <Mark_Andrews at isc.org> wrote:
>
>
> >
> > > I'm currently running Bind 9.4.1 (Ubuntu Gutsy).  I have several zones
> > > in master->slave setups, which normally works just fine.  The other
> > > day, however, I ran into an odd problem.  A couple of the slave zones
> > > generally update over an ipsec connected network.  The ipsec
> > > connection went away, and shortly thereafter bind royally wedged
> > > itself, refusing to serve any data (including basic forward lookups)
> > > and was not even responding to rndc restarts.  It took me a good while
> > > of restarting the system and poking around logs to decide to strace
> > > the process, which eventually lead me to removing the ipsec-dependant
> > > slave zones from the config.  As soon as I did this, Bind became
> > > stable again.  Interestingly, zones which updated over public IP space
> > > behaved fine, even if the master server was unreachable.  It was only
> > > zones that were trying to go over the down ipsec connection that hosed
> > > the daemon.
> > >
> > > This whole issue is logged in a bit more detail here, including output
> > > from strace:
> > > https://bugs.launchpad.net/ubuntu/+source/bind/+bug/177489
> > >
> > > I can (apparently) reproduce this issue again with little difficulty,
> > > so I'd be glad to help debug it.
> > >
> > > -
> > > Matt LaPlante
> >
> >       I would say that some I/O is blocking when it shouldn't
> >       with sockets which use ipsec.  If this is the case it is
> >       a kernel bug and named can't do anything to prevent it.
> >       Named marks all sockets as non-blocking.
> >
> >       Mark
>
>        We should be able to prove / disprove the theory above with
>        this patch.  It sets a 2 second alarm for all socket I/O.
>        You need to build named with threaded disabled and run with
>        "named -g <rest of named's arguements>".
>
>        You will get "ALARM: message" printed to stderr.
>
>        Mark
>
> Index: lib/isc/unix/socket.c
> ===================================================================
> RCS file: /proj/cvs/prod/bind9/lib/isc/unix/socket.c,v
> retrieving revision 1.275
> diff -u -r1.275 socket.c
> --- lib/isc/unix/socket.c       14 Dec 2007 03:52:40 -0000      1.275
> +++ lib/isc/unix/socket.c       24 Dec 2007 01:27:38 -0000
> @@ -281,6 +281,18 @@
>
>  #define SOCK_DEAD(s)                   ((s)->references == 0)
>
> +#ifndef ISC_PLATFORM_USETHREADS
> +static const char *alarm_msg = "ALARM: none\n";
> +static void alarm_handler(int foo) {
> +
> +       UNUSED(foo);
> +       /*
> +        * Write message to stderr.
> +        */
> +       write(2, alarm_msg, strlen(alarm_msg));
> +}
> +#endif
> +
>  static void
>  manager_log(isc_socketmgr_t *sockmgr,
>            isc_logcategory_t *category, isc_logmodule_t *module, int level,
> @@ -985,8 +997,16 @@
>        dump_msg(&msghdr);
>  #endif
>
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(2);
> +       alarm_msg = "ALARM: recvmsg\n";
> +#endif
>        cc = recvmsg(sock->fd, &msghdr, 0);
>        recv_errno = errno;
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(0);
> +       alarm_msg = "ALARM: none\n";
> +#endif
>
>  #if defined(ISC_SOCKET_DEBUG)
>        dump_msg(&msghdr);
> @@ -1142,8 +1162,16 @@
>        build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
>
>  resend:
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(2);
> +       alarm_msg = "ALARM: sendmsg\n";
> +#endif
>        cc = sendmsg(sock->fd, &msghdr, 0);
>        send_errno = errno;
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(0);
> +       alarm_msg = "ALARM: none\n";
> +#endif
>
>        /*
>         * Check for error or block condition.
> @@ -2623,6 +2651,7 @@
>        REQUIRE(managerp != NULL && *managerp == NULL);
>
>  #ifndef ISC_PLATFORM_USETHREADS
> +       signal(SIGALRM, alarm_handler);
>        if (socketmgr != NULL) {
>                socketmgr->refs++;
>                *managerp = socketmgr;
> @@ -3281,6 +3310,10 @@
>                goto cleanup;
>        }
>
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(2);
> +       alarm_msg = "ALARM: connect unix\n";
> +#endif
>        if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
>                    sizeof(sockaddr->type.sunix)) < 0) {
>                switch (errno) {
> @@ -3306,6 +3339,10 @@
>                        break;
>                }
>        }
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(0);
> +       alarm_msg = "ALARM: none\n";
> +#endif
>  cleanup:
>        close(s);
>  #else
> @@ -3573,7 +3610,7 @@
>        isc_socket_connev_t *dev;
>        isc_task_t *ntask = NULL;
>        isc_socketmgr_t *manager;
> -       int cc;
> +       int cc, connect_errno;
>        char strbuf[ISC_STRERRORSIZE];
>
>        REQUIRE(VALID_SOCKET(sock));
> @@ -3607,12 +3644,21 @@
>         * outstanding, and it might happen to complete.
>         */
>        sock->peer_address = *addr;
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(2);
> +       alarm_msg = "ALARM: connect\n";
> +#endif
>        cc = connect(sock->fd, &addr->type.sa, addr->length);
> +       connect_errno = errno;
> +#ifndef ISC_PLATFORM_USETHREADS
> +       alarm(0);
> +       alarm_msg = "ALARM: none\n";
> +#endif
>        if (cc < 0) {
> -               if (SOFT_ERROR(errno) || errno == EINPROGRESS)
> +               if (SOFT_ERROR(connect_errno) || connect_errno == EINPROGRESS)
>                        goto queue;
>
> -               switch (errno) {
> +               switch (connect_errno) {
>  #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
>                        ERROR_MATCH(EACCES, ISC_R_NOPERM);
>                        ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
> @@ -3632,8 +3678,9 @@
>
>                sock->connected = 0;
>
> -               isc__strerror(errno, strbuf, sizeof(strbuf));
> -               UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", errno, strbuf);
> +               isc__strerror(connect_errno, strbuf, sizeof(strbuf));
> +               UNEXPECTED_ERROR(__FILE__, __LINE__, "%d/%s", connect_errno,
> +                                strbuf);
>
>                UNLOCK(&sock->lock);
>                isc_event_free(ISC_EVENT_PTR(&dev));


Thanks for taking the time to help me investigate this. :)

I applied the patch and built w/out threading, however I get this
error when trying to run the new binary:

named: pthread_mutex_lock.c:82: __pthread_mutex_lock: Assertion
`mutex->__data.__owner == 0' failed.
Aborted (core dumped)

Running with -g did allow me to see a new error (using the orignal
binary) as follows:

23-Dec-2007 21:37:52.844 socket.c:1173: unexpected error:
23-Dec-2007 21:37:52.848 internal_send: 192.168.1.1#53: Interrupted
system call should be restarted
23-Dec-2007 21:37:52.852 errno2result.c:111: unexpected error:
23-Dec-2007 21:37:52.855 unable to convert errno to isc_result: 85:
Interrupted system call should be restarted
23-Dec-2007 21:38:03.644 socket.c:1173: unexpected error:
23-Dec-2007 21:38:03.644 internal_send: 192.168.1.1#53: Interrupted
system call should be restarted
23-Dec-2007 21:38:03.644 errno2result.c:111: unexpected error:
23-Dec-2007 21:38:03.644 unable to convert errno to isc_result: 85:
Interrupted system call should be restarted
23-Dec-2007 21:38:03.650 zone 1.168.192.in-addr.arpa/IN: refresh:
failure trying master 192.168.1.1#53 (source 0.0.0.0#0): operation
canceled


>
> --
> Mark Andrews, ISC
> 1 Seymour St., Dundas Valley, NSW 2117, Australia
> PHONE: +61 2 9871 4742                 INTERNET: Mark_Andrews at isc.org
>



More information about the bind-users mailing list