/* Copyright © 2016 Brandon L Black <blblack@gmail.com>
 *
 * This file is part of gdnsd.
 *
 * gdnsd is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * gdnsd is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with gdnsd.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

#include <config.h>
#include "css.h"
#include "csc.h"
#include "cs.h"
#include "main.h"
#include "statio.h"
#include "main.h"
#include "socks.h"
#include "chal.h"

#include <gdnsd/compiler.h>
#include <gdnsd/alloc.h>
#include <gdnsd/log.h>
#include <gdnsd/paths.h>
#include <gdnsd/net.h>
#include <gdnsd/misc.h>
#include "plugins/mon.h"

#include <stddef.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/file.h>
#include <sys/wait.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/tcp.h>

// makes sides of int[] from pipe2() clearer
#define PIPE_RD 0
#define PIPE_WR 1

static const char base_sock[] = "control.sock";
static const char base_lock[] = "control.lock";

// Per-connection protocol state.  The notes on each value describe how the
// handlers in this file use it.
typedef enum {
    READING_REQ,       // waiting to read a fixed 8-byte request
    READING_DATA,      // reading the extra data that follows a REQ_CHAL request
    WAITING_SERVER,    // request fully read, response not yet set up
    WRITING_RESP,      // writing the fixed 8-byte response
    WRITING_RESP_FDS,  // writing 8-byte responses that carry SCM_RIGHTS fds
    WRITING_RESP_DATA  // writing extended response data after the 8-byte response
} css_cstate_t;

struct css_conn_s_;
typedef struct css_conn_s_ css_conn_t;

// State for a single accepted control socket connection
struct css_conn_s_ {
    css_conn_t* next; // linked-list for cleanup
    css_conn_t* prev;
    css_t* css;       // owning control socket server
    csbuf_t rbuf;     // 8-byte request is received into rbuf.raw
    csbuf_t wbuf;     // 8-byte response is sent from wbuf.raw
    char* data;       // extended request/response data buffer, freed by this file when done
    ev_io w_read;     // read watcher on fd (callback: css_conn_read)
    ev_io w_write;    // write watcher on fd (callback: css_conn_write)
    int fd;           // connection socket; closed at cleanup only if >= 0
    size_t size;      // total amount to transfer: data bytes, or fd count while sending fds
    size_t size_done; // portion of "size" transferred so far
    css_cstate_t state;
    ctl_addr_t* ctl_addr; // if TCP, points at perms
};

// Simple growable array of connection pointers, used for the reload-zones
// waiters.
typedef struct {
    css_conn_t** q; // array of "len" connection pointers, NULL when empty
    size_t len;     // number of entries in "q"
} conn_queue_t;

// Appends connection "c" to the end of "queue", growing the backing array by
// exactly one slot to make room for it.
static void conn_queue_add(conn_queue_t* queue, css_conn_t* c)
{
    const size_t new_len = queue->len + 1;
    queue->q = xrealloc_n(queue->q, new_len, sizeof(*queue->q));
    queue->q[new_len - 1] = c;
    queue->len = new_len;
}

// Empties "queue": releases the backing array (if any) and resets it to the
// zero-length, NULL-array state.  The queued connections themselves are not
// touched.
static void conn_queue_clear(conn_queue_t* queue)
{
    free(queue->q); // free(NULL) is a no-op, so no guard is needed
    queue->q = NULL;
    queue->len = 0;
}

// Per-listener state for one TCP control socket listener.  The accept
// callback (css_accept_tcp) receives this via the watcher's data pointer.
typedef struct {
    css_t* css;           // owning control socket server
    ctl_addr_t* ctl_addr; // points at &css->socks_cfg->ctl_addrs[x]
    ev_io w_tcp_accept; // holds the listen fd inside as well
} tcp_lsnr_t;

// Top-level control socket server state
struct css_s_ {
    int fd;                // NOTE(review): presumably the unix listen socket fd; not referenced in this part of the file
    int lock_fd;           // NOTE(review): presumably the fd of the lock file; not referenced in this part of the file
    uint32_t status_v;     // "v" value returned in REQ_INFO responses
    uint32_t status_d;     // "d" value returned in REQ_INFO responses
    ev_io w_accept;        // accept watcher for the unix listener
    ev_timer w_replace;    // timer running css_watch_replace() while a replace is in progress
    tcp_lsnr_t* tcp_lsnrs; // array indexed up to socks_cfg->num_ctl_addrs
    struct ev_loop* loop;  // event loop all of the watchers here run on
    css_conn_t* clients;   // head of the doubly-linked list of live connections
    conn_queue_t reload_zones_queued; // reload-zones clients waiting for the next reload run
    conn_queue_t reload_zones_active; // reload-zones clients attached to the running reload
    char* argv0;           // program name/path used to exec the replacement daemon
    socks_cfg_t* socks_cfg;
    css_conn_t* replace_conn_ctl; // client that sent REQ_REPL and awaits the final result
    css_conn_t* replace_conn_dmn; // connection from the replacement daemon (set at REQ_TAK1)
    int* handoff_fds;         // fds sent via SCM_RIGHTS in response to REQ_TAKE
    size_t handoff_fds_count; // number of entries in handoff_fds
    pid_t replacement_pid;    // PID of the replacement daemon, or 0 when no replace is in progress
};

// Exchanges the contents of the "queued" and "active" reload-zones queues.
static void swap_reload_zones_queues(css_t* css)
{
    const conn_queue_t was_queued = css->reload_zones_queued;
    css->reload_zones_queued = css->reload_zones_active;
    css->reload_zones_active = was_queued;
}

// Tears down connection "c" completely: drops any replace-tracking references
// to it, frees its data buffer, stops both watchers, closes the socket (when
// fd >= 0), unlinks it from the server's client list and frees it.  "c" must
// not be used after this returns.
F_NONNULL
static void css_conn_cleanup(css_conn_t* c)
{
    css_t* css = c->css;
    gdnsd_assert(css);

    if (css->replace_conn_ctl == c)
        css->replace_conn_ctl = NULL;

    if (css->replace_conn_dmn == c) {
        css->replace_conn_dmn = NULL;
        // Losing the csock connection to the replacement daemon (whether it
        // dropped it, or we're dropping it over a communications error) is
        // treated as a failing replacement: SIGKILL it, and let our PID
        // watcher handle the rest of the cleanup once it exits.
        if (css->replacement_pid) {
            log_err("REPLACE[old daemon]: Communications error with new daemon at pid %li, killing it with SIGKILL",
                    (long)css->replacement_pid);
            kill(css->replacement_pid, SIGKILL);
        }
    }

    // release the i/o resources
    free(c->data);
    ev_io* rd_watcher = &c->w_read;
    ev_io_stop(css->loop, rd_watcher);
    ev_io* wr_watcher = &c->w_write;
    ev_io_stop(css->loop, wr_watcher);
    if (c->fd >= 0)
        close(c->fd);

    // unlink from the doubly-linked client list
    css_conn_t* const after = c->next;
    css_conn_t* const before = c->prev;
    if (css->clients == c)
        css->clients = after;
    if (before)
        before->next = after;
    if (after)
        after->prev = before;

    free(c);
}

// Sends an immediate RESP_ACK (v=0, d=0) on the connection with a direct
// send() call instead of going through the write watcher.
// Returns false on success.  Returns true on failure, in which case the
// connection has already been cleaned up and "c" must not be used again.
F_NONNULL
static bool respond_blocking_ack(css_conn_t* c)
{
    gdnsd_assert(c->css);
    gdnsd_assert(c->state == WAITING_SERVER);

    c->wbuf.key = RESP_ACK;
    csbuf_set_v(&c->wbuf, 0);
    c->wbuf.d = 0;
    c->state = WRITING_RESP;

    const ssize_t sent = send(c->fd, c->wbuf.raw, 8, 0);
    if (sent == 8)
        return false;

    log_err("blocking control socket write of 8 bytes failed with retval %zi, closing: %s", sent, logf_errno());
    css_conn_cleanup(c);
    return true;
}

// Writes as much of the remaining extended response data as the socket will
// take without blocking.  Once all of it has been sent, the buffer is freed
// and the connection goes back to reading requests.  On a hard send error the
// connection is cleaned up.
F_NONNULL
static void css_conn_write_data(css_conn_t* c)
{
    gdnsd_assert(c->state == WRITING_RESP_DATA);
    gdnsd_assert(c->data);
    gdnsd_assert(c->size);
    gdnsd_assert(c->size > c->size_done);

    const size_t remaining = c->size - c->size_done;
    const ssize_t sent = send(c->fd, c->data + c->size_done, remaining, MSG_DONTWAIT);
    if (sent < 0) {
        if (ERRNO_WOULDBLOCK)
            return; // try again on the next writability event
        log_err("control socket write of %zu bytes failed with retval %zi, closing: %s", remaining, sent, logf_errno());
        css_conn_cleanup(c);
        return;
    }

    c->size_done += (size_t)sent;
    if (c->size_done != c->size)
        return; // partial write, more to send later

    // All data sent: drop the buffer and switch back to request-reading
    free(c->data);
    c->data = NULL;
    c->size = 0;
    c->size_done = 0;
    ev_io* wr_watcher = &c->w_write;
    ev_io_stop(c->css->loop, wr_watcher);
    ev_io* rd_watcher = &c->w_read;
    ev_io_start(c->css->loop, rd_watcher);
    c->state = READING_REQ;
}

// Sends the 8-byte response held in c->wbuf with a non-blocking sendmsg().
// In state WRITING_RESP_FDS, each message also carries up to SCM_MAX_FDS file
// descriptors from css->handoff_fds as SCM_RIGHTS ancillary data, and the
// function is re-invoked (via the write watcher) until all of them are sent.
// Returns true only when the 8-byte response was sent and extended data
// (c->data) remains to be written, in which case state has been moved to
// WRITING_RESP_DATA.  Returns false in every other case, including the error
// path where the connection was cleaned up and "c" is no longer valid.
F_NONNULL
static bool css_conn_write_resp(css_conn_t* c)
{
    gdnsd_assert(c->state == WRITING_RESP || c->state == WRITING_RESP_FDS);

    // The union gives the control message buffer the alignment of a cmsghdr
    union {
        struct cmsghdr c;
        char cmsg_buf[CMSG_SPACE(sizeof(int) * SCM_MAX_FDS)];
    } u;
    struct iovec iov = { .iov_base = c->wbuf.raw, .iov_len  = 8 };
    struct msghdr msg;
    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;

    size_t send_fd_count = SCM_MAX_FDS;
    if (c->state == WRITING_RESP_FDS) {
        // In this state, size/size_done count file descriptors, not bytes
        gdnsd_assert(c->size > c->size_done);
        const size_t fd_todo = c->size - c->size_done;
        if (fd_todo < SCM_MAX_FDS)
            send_fd_count = fd_todo;
        const size_t send_fd_len = sizeof(int) * send_fd_count;
        memset(u.cmsg_buf, 0, sizeof(u.cmsg_buf));
        msg.msg_control = u.cmsg_buf;
        msg.msg_controllen = CMSG_LEN(send_fd_len);
        struct cmsghdr* cmsg = CMSG_FIRSTHDR(&msg);
        gdnsd_assert(cmsg);
        cmsg->cmsg_level = SOL_SOCKET;
        cmsg->cmsg_type = SCM_RIGHTS;
        cmsg->cmsg_len = CMSG_LEN(send_fd_len);
        memcpy(CMSG_DATA(cmsg), &c->css->handoff_fds[c->size_done], send_fd_len);
    }

    ssize_t pktlen = sendmsg(c->fd, &msg, MSG_DONTWAIT);
    if (pktlen != 8) {
        // Would-block just waits for the next write event; anything else
        // (including a short write) drops the connection.
        if (pktlen < 0 && ERRNO_WOULDBLOCK)
            return false;
        log_err("control socket write of 8 bytes failed with retval %zi, closing: %s", pktlen, logf_errno());
        css_conn_cleanup(c);
        return false;
    }

    if (c->state == WRITING_RESP_FDS) {
        c->size_done += send_fd_count;
        // More fds left: stay in this state and send another message later
        if (c->size_done < c->size)
            return false;
        c->size = 0;
        c->size_done = 0;
    } else if (c->data) {
        c->state = WRITING_RESP_DATA;
        return true;
    }

    // Response fully sent with nothing to follow: go back to reading requests
    ev_io* w_write = &c->w_write;
    ev_io_stop(c->css->loop, w_write);
    ev_io* w_read = &c->w_read;
    ev_io_start(c->css->loop, w_read);
    c->state = READING_REQ;
    return false;
}

// libev write-readiness callback for a connection.  Sends the 8-byte response
// first if it is still pending, and then (immediately, when the response
// step reports that extended data follows) pushes out extended data.
F_NONNULL
static void css_conn_write(struct ev_loop* loop V_UNUSED, ev_io* w, int revents V_UNUSED)
{
    gdnsd_assert(revents == EV_WRITE);
    css_conn_t* c = w->data;
    gdnsd_assert(c);
    gdnsd_assert(c->state == WRITING_RESP || c->state == WRITING_RESP_FDS || c->state == WRITING_RESP_DATA);

    if (c->state != WRITING_RESP_DATA) {
        // A false return means there's no data phase to run right now (and
        // the connection may even be gone), so we're done with this event.
        if (!css_conn_write_resp(c))
            return;
    }

    css_conn_write_data(c);
}

// Sets up an asynchronous response on connection "c" and starts its write
// watcher.  The 8-byte response carries "key", "v" and "d".
// When "data" is non-NULL, it is a buffer of extended response data of
// length "d", which is sent after the 8-byte response and freed once sent.
// When "send_fds" is true, the SCM_RIGHTS fd list response for REQ_TAKE is
// sent instead, and the caller must pass: key=RESP_ACK, v=0, d=0, data=NULL.
// Setting both "data" and "send_fds" is not supported.
F_NONNULLX(1)
static void respond(css_conn_t* c, const char key, const uint32_t v, const uint32_t d, char* data, bool send_fds)
{
    gdnsd_assert(c->css);
    gdnsd_assert(c->state == WAITING_SERVER);
    gdnsd_assert(v <= 0xFFFFFF);
    gdnsd_assert(!data || !send_fds); // we don't support setting both

    c->wbuf.key = key;
    csbuf_set_v(&c->wbuf, v);
    c->wbuf.d = d;

    if (data) {
        // plain response followed by "d" bytes of extended data
        c->state = WRITING_RESP;
        c->data = data;
        c->size = d;
        c->size_done = 0;
    } else if (send_fds) {
        // fd handoff: "v" is replaced with the fd count, and size/size_done
        // track how many fds have been sent
        gdnsd_assert(key == RESP_ACK);
        gdnsd_assert(!v);
        gdnsd_assert(!d);
        csbuf_set_v(&c->wbuf, c->css->handoff_fds_count);
        c->size = c->css->handoff_fds_count;
        c->size_done = 0;
        c->state = WRITING_RESP_FDS;
    } else {
        // plain 8-byte response only
        c->state = WRITING_RESP;
    }

    ev_io* wr_watcher = &c->w_write;
    ev_io_start(c->css->loop, wr_watcher);
}

// Responds to a takeover phase 2 request: RESP_ACK with the output of
// csets_serialize() attached (the count goes in "v", the byte size in "d",
// and the returned buffer is passed as the extended data).
F_NONNULL
static void respond_tak2(struct ev_loop* loop, css_conn_t* c)
{
    size_t count = 0;
    size_t size = 0;
    uint8_t* const serialized = csets_serialize(loop, &count, &size);
    respond(c, RESP_ACK, (uint32_t)count, (uint32_t)size, (char*)serialized, false);
}

// Returns true when no replacement PID is currently being tracked (i.e.
// replacement_pid is zero), false while one is.
bool css_stop_ok(const css_t* css)
{
    return css->replacement_pid == 0;
}

// Timer callback that runs while a replace operation is in progress.  It
// probes the replacement daemon's PID with kill(pid, 0); when that probe
// fails the replacement is considered dead, the replace attempt is abandoned,
// and this daemon goes back to normal service:
//   * the timer is stopped
//   * a waiting "replace" control client (if any) gets RESP_FAIL
//   * the replacement daemon's own connection (if any) is torn down
//   * replace-tracking state is reset and all accept watchers are restarted
F_NONNULL
static void css_watch_replace(struct ev_loop* loop, ev_timer* w, int revents V_UNUSED)
{
    gdnsd_assert(revents == EV_TIMER);
    css_t* css = w->data;
    gdnsd_assert(css);
    gdnsd_assert(css->replacement_pid);

    // libev's default SIGCHLD handler auto-reaps for us.
    // Nothing to do while the replacement process still exists.
    if (!kill(css->replacement_pid, 0))
        return;

    // The process that was attempting the replace operation has died and
    // we're still here, so we have some cleanup to do...
    log_err("REPLACE[old daemon]: New daemon at PID %li died, resuming normal operations",
            (long)css->replacement_pid);
    ev_timer_stop(loop, w);

    // Reset the PID *before* cleaning up the daemon connection below:
    // css_conn_cleanup() sends SIGKILL to replacement_pid when it tears down
    // replace_conn_dmn with the PID still set.  We already know that process
    // is gone, so signalling the stale PID would be pointless at best (and
    // would log a misleading "communications error").
    css->replacement_pid = 0;

    if (css->replace_conn_ctl)
        respond(css->replace_conn_ctl, RESP_FAIL, 0, 0, NULL, false);

    if (css->replace_conn_dmn)
        css_conn_cleanup(css->replace_conn_dmn);

    // re-set our states so that further stop/replace actions can happen
    css->replace_conn_ctl = NULL;
    css->replace_conn_dmn = NULL;

    // Re-start our accept watchers (they may have been stopped by REQ_TAKE)
    ev_io* w_accept = &css->w_accept;
    ev_io_start(css->loop, w_accept);
    for (unsigned i = 0; i < css->socks_cfg->num_ctl_addrs; i++) {
        ev_io* w_tcp_accept = &css->tcp_lsnrs[i].w_tcp_accept;
        ev_io_start(css->loop, w_tcp_accept);
    }
}

// spawn_replacement() (and its subfunction for the forked processes):
//   We have to do a double-fork here to satisfy systemd, otherwise when we
// notify it of the new MainPID from the new child while the old parent daemon
// still exists, the new child's parent isn't (yet) systemd, and so it
// considers it an "alien" MainPID (which it complains about, and never
// un-complains or fixes it when the old parent daemon eventually exits,
// re-parenting the new child to systemd properly via orphanage).
//   Note also that if we don't serially reap (waitpid) the middle PID, we'd
// face a race on whether the re-parenting to systemd happens before the
// notification from the new child.  libev already has a SIGCHLD auto-reaper
// running, which will race with our own reaper to no ill effect.  We just need
// to be sure the pid is gone completely before continuing.
//   However, we also still need to track the final PID of the new child in the
// original daemon, in order to prevent races between multiple replacements, so
// we'll also have to set up a pipe() to communicate the final PID back to the
// parent from the middle process.
// Thanks, systemd :P

// Everything after the first fork happens in these first two functions, for
// clarity about which lines of code are executing in which process context.
// Because this is a forked child process and the parent has multiple threads
// running, everything here must be async-signal-safe!

// Helper used a few times below: writes the 4-byte "pidval" to the pipe fd
// "writefd", and dies via log_fatal_safe() if the full 4 bytes could not be
// written.
F_NONNULL
static void send_pidval_(const int writefd, const uint32_t pidval)
{
    const ssize_t written = write(writefd, &pidval, 4);
    if (written != 4)
        log_fatal_safe("write() of PID failed during replacement spawn");
}

// Handles the second fork into the real exec, and also sends back a pid value
// (or zero for certain error paths) through the pipe socket to the parent.
// Runs only in the forked "middle" process and never returns:
//   argv0  - program to exec (looked up via execlp)
//   cfpath - value passed after "-c" on the new daemon's command line
//   flags  - extra flags argument string for the new daemon
//   pipefd - the pipe pair created by the original daemon
F_NORETURN F_NONNULL
static void replacement_proc(const char* argv0, const char* cfpath, const char* flags, const int* pipefd)
{
    close(pipefd[PIPE_RD]); // only the parent reads from the pipe

    const pid_t replacement_pid = fork();
    if (replacement_pid == -1) {
        // Report a definite failure (zero) to the original daemon
        send_pidval_(pipefd[PIPE_WR], 0);
        log_fatal_safe("second fork() failed during replacement spawn");
    }

    if (!replacement_pid) { // final child, new proc for just this block:
        // write side of pipe doesn't need close here, because it's O_CLOEXEC
        gdnsd_reset_signals_for_exec();
        execlp(argv0, argv0, "-c", cfpath, flags, "start", NULL);
        // Only reached if execlp() failed
        send_pidval_(pipefd[PIPE_WR], 0);
        log_fatal_safe("execlp() failed during replacement spawn");
    }

    // Middle process: hand the final child's PID to the original daemon, then
    // exit so that the final child gets re-parented.
    send_pidval_(pipefd[PIPE_WR], (uint32_t)replacement_pid);
#ifdef GDNSD_VALGRIND
    execl("/bin/true", "/bin/true", NULL);
#endif
    _exit(0);
}

// Spawns the replacement daemon via a double fork (see the long comment
// above), using "argv0" as the program to execute.
// Return value:
//   0  -> Definitely failed to launch
//   -1 -> Maybe launched successfully or not, but either way lost comms and
//         didn't get a PID.
//   >0 -> PID of at least initially-successful launch
// The calling thread's signal mask is the same on return as it was on entry,
// on every return path.
static pid_t spawn_replacement(const char* argv0)
{
    pid_t retval = 0;

    // Set up the more-complicated exec args, to be used much deeper during
    // execlp() of the final replacement child
    const char* cfpath = gdnsd_get_config_dir();
    char flags[5] = { '-', 'R', '\0', '\0', '\0' };
    unsigned fidx = 2;
    if (gdnsd_log_get_debug())
        flags[fidx++] = 'D';
    if (gdnsd_log_get_syslog())
        flags[fidx++] = 'l';

    // Create the pipe before touching the signal mask, so that a failure here
    // has no signal state to undo.
    int pipefd[2];
    if (pipe2(pipefd, O_CLOEXEC)) {
        log_err("replace failure: pipe2(O_CLOEXEC) failed: %s", logf_errno());
        return retval;
    }

    // Before forking, block all signals and save the old mask
    //   to avoid a race condition where local sighandlers execute
    //   in the child between fork and exec().
    sigset_t all_sigs;
    sigfillset(&all_sigs);
    sigset_t saved_mask;
    sigemptyset(&saved_mask);
    if (pthread_sigmask(SIG_SETMASK, &all_sigs, &saved_mask)) {
        log_err("replace failure: pthread_sigmask() failed");
        close(pipefd[PIPE_WR]);
        close(pipefd[PIPE_RD]);
        return retval;
    }

    pid_t middle_pid = fork();
    if (middle_pid == -1) {
        log_err("replace failure: fork() failed: %s", logf_errno());
        // Must not return to the eventloop with every signal still blocked
        if (pthread_sigmask(SIG_SETMASK, &saved_mask, NULL))
            log_fatal("pthread_sigmask() failed");
        close(pipefd[PIPE_WR]);
        close(pipefd[PIPE_RD]);
        return retval;
    }

    // The forked middle proc executes replacement_proc() and does not return
    // to this function!
    if (!middle_pid)
        replacement_proc(argv0, cfpath, flags, pipefd);

    // restore previous signal mask from before fork
    // This really should not fail, since saved_mask is right on the stack here
    // and came from the same interface.  Fatal is reasonable here, as we're
    // really in an unknown buggy situation for the main process at that point:
    if (pthread_sigmask(SIG_SETMASK, &saved_mask, NULL))
        log_fatal("pthread_sigmask() failed");

    // Close write-side of pipe in parent (middle proc does the writing)
    close(pipefd[PIPE_WR]);

    // Read the PID of the new daemon, sent to us by the middle proc.  If we
    // read a zero for a definite failure-to-launch, that passes on directly to
    // the caller.  A failure-to-read returns the special value -1, indicating
    // indeterminate child status.  Signals are unblocked again at this point,
    // so retry if a signal interrupts the read before any data arrived.
    uint32_t recvpid = 0;
    ssize_t readrv;
    do {
        readrv = read(pipefd[PIPE_RD], &recvpid, 4);
    } while (readrv < 0 && errno == EINTR);
    if (readrv == 4)
        retval = (pid_t)recvpid;
    else
        retval = (pid_t) -1;
    close(pipefd[PIPE_RD]);

    // Reap the middle PID before continuing, so that the final child is
    // known to be re-parented before it signals success (e.g. via systemd
    // notification).  We'll log non-fatal errors here if something looks
    // fishy, but ultimately the retval status is determined based on the
    // success of the pipe read above.
    int status = 0;
    pid_t wp_rv;
    do {
        wp_rv = waitpid(middle_pid, &status, 0);
    } while (wp_rv < 0 && errno == EINTR);
    if (wp_rv < 0) {
        // We can assume ECHILD means the libev SIGCHLD handler beat us to waitpid()
        if (errno != ECHILD)
            log_err("waitpid(%li) for temporary middle process during replacement spawn failed: %s",
                    (long)middle_pid, logf_errno());
    } else {
        if (wp_rv != middle_pid)
            log_err("waitpid(%li) for temporary middle process during replacement spawn caught process %li instead",
                    (long)middle_pid, (long)wp_rv);
        if (status)
            log_err("waitpid(%li) for temporary middle process during replacement spawn returned bad status %i",
                    (long)middle_pid, status);
    }

    return retval;
}

// When a takeover starts (replacement_pid is assigned), send an immediate
// RESP_LATR to all waiting reload-zones clients (even active ones with a
// thread already running), so they'll retry against the new daemon.  Both
// queues are emptied afterwards.
static void latr_all_reloaders(css_t* css)
{
    // active waiters are answered first, then the queued ones
    conn_queue_t* const queues[2] = {
        &css->reload_zones_active,
        &css->reload_zones_queued,
    };

    for (size_t qi = 0; qi < 2; qi++) {
        const conn_queue_t* const cq = queues[qi];
        for (size_t ci = 0; ci < cq->len; ci++) {
            log_info("REPLACE[old daemon]: Deferring reload-zones request while replace in progress");
            respond(cq->q[ci], RESP_LATR, 0, 0, NULL, false);
        }
    }

    conn_queue_clear(&css->reload_zones_active);
    conn_queue_clear(&css->reload_zones_queued);
}

// Reads (more of) the data portion of a REQ_CHAL request into c->data
// without blocking.  When the whole payload has arrived, the read watcher is
// stopped, the challenge set is created via cset_create() (unless a replace
// is in progress, in which case the client is told to retry later), the
// buffer is freed, and a response is queued.  A disconnect or hard read error
// cleans up the connection.
F_NONNULL
static void recv_challenge_data(struct ev_loop* loop, ev_io* w, css_conn_t* c, const css_t* css)
{
    gdnsd_assert(c->data);
    gdnsd_assert(c->size);
    gdnsd_assert(c->size > c->size_done);

    const size_t remaining = c->size - c->size_done;
    const ssize_t got = recv(c->fd, c->data + c->size_done, remaining, MSG_DONTWAIT);
    if (got <= 0) {
        if (got < 0 && ERRNO_WOULDBLOCK)
            return; // nothing available yet
        if (got < 0)
            log_err("control socket read of %zu data bytes failed with retval %zi, closing: %s", remaining, got, logf_errno());
        else
            log_err("control socket client disconnected when we expected %zu more bytes from it", remaining);
        css_conn_cleanup(c);
        return;
    }

    c->size_done += (size_t)got;
    if (c->size_done != c->size)
        return; // still waiting on more payload

    // Full payload received: stop reading and decide on the response
    ev_io_stop(loop, w);
    c->state = WAITING_SERVER;

    char resp_key;
    if (css->replacement_pid) {
        log_info("REPLACE[old daemon]: Deferring a new acme-dns-01 request while replace in progress");
        resp_key = RESP_LATR;
    } else if (cset_create(loop, 0, csbuf_get_v(&c->rbuf), c->size_done, (uint8_t*)c->data)) {
        resp_key = RESP_FAIL;
    } else {
        resp_key = RESP_ACK;
    }

    free(c->data);
    c->data = NULL;
    c->size = 0;
    c->size_done = 0;
    respond(c, resp_key, 0, 0, NULL, false);
}

// Handles REQ_STOP.  While a replace is in progress, only the replacement
// daemon's own connection may stop us, and any other client is told to retry
// later.  Otherwise the eventloop is broken out of and the client is ACKed
// with a direct send.  When a "replace" control client is also waiting, it
// gets its final ACK here too.
F_NONNULL
static void handle_req_stop(css_conn_t* c, css_t* css)
{
    // Decide this once, up front: if a blocking ack below fails, "c" is
    // cleaned up (freed) and css->replace_conn_dmn is reset to NULL, so
    // comparing the two after that point is no longer meaningful.
    const bool from_dmn = (c == css->replace_conn_dmn);

    if (css->replacement_pid) {
        if (!from_dmn) {
            log_info("REPLACE[old daemon]: Deferring a new stop request while replace in progress");
            respond(c, RESP_LATR, 0, 0, NULL, false);
            return;
        }
        log_info("REPLACE[old daemon]: Exiting cleanly at request of new daemon");
        // Note from here we won't re-enter the eventloop anyways, so
        // no further requests can be processed and the replacement_pid
        // flag isn't very useful anymore.  Explicitly re-setting it to
        // zero avoids the eventual css_conn_cleanup of this connection
        // (during css_delete(), or due to some communications failure
        // with the blocking acks below) trying to kill the new daemon
        // off because it thinks it's a fail-to-takeover sort of
        // situation.
        css->replacement_pid = 0;
    } else {
        log_info("Exiting cleanly due to control socket client request");
    }

    // Note this is the point of no return for the old daemon in "replace",
    // as we'll never re-enter the main thread's runtime eventloop to
    // process further control socket messages (or other events).
    ev_break(css->loop, EVBREAK_ALL);

    // ACK to the client that sent REQ_STOP
    // In non-replace cases (plain stop from e.g. gdnsdctl), set the fd
    // to -1 here so that we don't close it during css_delete, as the
    // response above was our last interaction with it.  In replace cases,
    // there's one more interaction during the final stats handoff, and the
    // new daemon doesn't wait on our close anyways.
    // (respond_blocking_ack() returns true when it failed and freed "c", in
    // which case "c" must not be touched.)
    if (!respond_blocking_ack(c) && !from_dmn)
        c->fd = -1;

    // If "gdnsdctl replace" is connected and driving the process, finally
    // give it an ACK response to its REQ_REPL, as we're now past the point
    // of no return on the replace operation, and also set its fd to -1 to
    // let it close as the process dies as above.
    if (css->replace_conn_ctl) {
        gdnsd_assert(from_dmn);
        if (!respond_blocking_ack(css->replace_conn_ctl))
            css->replace_conn_ctl->fd = -1;
    }
}

// Handles REQ_ZREL (reload-zones).  The client is queued for a response.
// If no reload is currently active, the queued set becomes the active set
// and the asynchronous reloader thread is started.  While a replace is in
// progress the client is told to retry later instead.
F_NONNULL
static void handle_req_zrel(css_conn_t* c, css_t* css)
{
    if (css->replacement_pid) {
        log_info("REPLACE[old daemon]: Deferring a new reload-zones request while replace in progress");
        respond(c, RESP_LATR, 0, 0, NULL, false);
        return;
    }

    conn_queue_add(&css->reload_zones_queued, c);

    // A reload is already running: this client just stays queued
    if (css->reload_zones_active.len)
        return;

    swap_reload_zones_queues(css);
    spawn_async_zones_reloader_thread();
}

// Handles REQ_REPL: spawns a replacement daemon unless a replace is already
// in progress.  On a successful spawn, the client's response is deferred (it
// is remembered as replace_conn_ctl), the PID watch timer is started, and
// all reload-zones waiters are told to retry later.  On failure the client
// gets RESP_FAIL and normal service continues.
F_NONNULL
static void handle_req_repl(css_conn_t* c, css_t* css)
{
    if (css->replacement_pid) {
        log_info("REPLACE[old daemon]: Deferring a new replace request while another replace already in progress");
        respond(c, RESP_LATR, 0, 0, NULL, false);
        return;
    }

    log_debug("REPLACE[old daemon]: Accepting replace command, spawning replacement server...");
    gdnsd_assert(!css->replace_conn_ctl);
    gdnsd_assert(!css->replace_conn_dmn);

    // <0: indeterminate, 0: definite failure, >0: PID of the new daemon
    const pid_t new_pid = spawn_replacement(css->argv0);

    if (new_pid < (pid_t)0) {
        // replacement_pid is left at zero: we have no PID to track
        log_err("REPLACE[old daemon]: Replacement launch operation *may* have failed, resuming full service for now.  There is a chance the replacement is launching correctly but is untracked, and could succeed as an independent takover daemon shortly");
        respond(c, RESP_FAIL, 0, 0, NULL, false);
        return;
    }

    if (!new_pid) {
        log_err("REPLACE[old daemon]: Replacement launch operation definitely failed, resuming full service");
        respond(c, RESP_FAIL, 0, 0, NULL, false);
        return;
    }

    css->replacement_pid = new_pid;
    css->replace_conn_ctl = c;
    log_info("REPLACE[old daemon]: Accepted replace command, spawned replacement daemon at PID %li", (long)css->replacement_pid);
    ev_timer* replace_timer = &css->w_replace;
    ev_timer_start(css->loop, replace_timer);
    latr_all_reloaders(css);
}

// Handles REQ_TAK1, the takeover pre-notification from a new daemon whose
// PID arrives in the request's "d" field.  It is denied (RESP_LATR) when a
// replace with a different PID is already in progress.  Otherwise the PID
// and connection are recorded as the replacement, the PID watch timer is
// started, reload-zones waiters are deferred, and the request is ACKed.
F_NONNULL
static void handle_req_tak1(css_conn_t* c, css_t* css)
{
    const pid_t take_pid = (pid_t)c->rbuf.d;
    const pid_t cur_pid = css->replacement_pid;

    if (cur_pid && cur_pid != take_pid) {
        log_warn("REPLACE[old daemon]: Denying takeover notification from PID %li while replace is already in progress with PID %li", (long)take_pid, (long)css->replacement_pid);
        // could argue for LATR or FAIL here, but currently the new daemon doesn't wait and retry anyways
        respond(c, RESP_LATR, 0, 0, NULL, false);
        return;
    }

    log_debug("REPLACE[old daemon]: Accepted takeover notification from PID %li", (long)take_pid);
    css->replacement_pid = take_pid;
    gdnsd_assert(!css->replace_conn_dmn);
    css->replace_conn_dmn = c;

    ev_timer* replace_timer = &css->w_replace;
    ev_timer_start(css->loop, replace_timer);
    latr_all_reloaders(css);
    respond(c, RESP_ACK, 0, 0, NULL, false);
}

// Common 3-way logging function for the next two handlers.  Picks the
// warning text based on why the takeover phase request was refused:
//   * no replace in progress at all (repl_pid is zero)
//   * a replace is in progress, but with a different PID
//   * the PID matches, so the request must have arrived on the wrong socket
static void log_illegal_takeover(const char phase, const long take_pid, const long repl_pid)
{
    if (repl_pid && take_pid == repl_pid)
        log_warn("REPLACE[old daemon]: Denying illegal takeover phase %c from PID %li which did not arrive on the existing takeover socket", phase, take_pid);
    else if (repl_pid)
        log_warn("REPLACE[old daemon]: Denying illegal takeover phase %c from PID %li while replace is already in progress with PID %li", phase, take_pid, repl_pid);
    else
        log_warn("REPLACE[old daemon]: Denying illegal takeover phase %c from PID %li without pre-notification", phase, take_pid);
}

// Handles REQ_TAK2 (takeover phase 2: challenge data request).  This is only
// legal from the registered replacement daemon: a replace must be in
// progress, the PID in the request's "d" field must match it, and the
// request must arrive on the connection recorded at REQ_TAK1.  Anything else
// is refused with RESP_FAIL.
F_NONNULL
static void handle_req_tak2(css_conn_t* c, const css_t* css)
{
    const pid_t take_pid = (pid_t)c->rbuf.d;
    if (!css->replacement_pid || take_pid != css->replacement_pid || c != css->replace_conn_dmn) {
        log_illegal_takeover('2', (long)take_pid, (long)css->replacement_pid);
        // respond() only queues the response for the write watcher, so the
        // connection must stay alive here for RESP_FAIL to actually reach
        // the client.  Tearing it down right away would silently discard the
        // response (and, if this happened to be the replacement daemon's
        // connection, SIGKILL that daemon as a side effect).
        respond(c, RESP_FAIL, 0, 0, NULL, false);
        return;
    }
    log_debug("REPLACE[old daemon]: Accepted takeover phase 2 (challenge data req) from PID %li", (long)take_pid);
    respond_tak2(css->loop, c);
}

// Handles REQ_TAKE (final takeover phase: socket handoff).  This is only
// legal from the registered replacement daemon, under the same three checks
// as REQ_TAK2, and anything else is refused with RESP_FAIL.  On acceptance,
// all control socket accept watchers are stopped and the handoff fds are
// sent as the response.
F_NONNULL
static void handle_req_take(css_conn_t* c, css_t* css)
{
    const pid_t take_pid = (pid_t)c->rbuf.d;
    if (!css->replacement_pid || take_pid != css->replacement_pid || c != css->replace_conn_dmn) {
        log_illegal_takeover('3', (long)take_pid, (long)css->replacement_pid);
        // respond() only queues the response for the write watcher, so the
        // connection must stay alive here for RESP_FAIL to actually reach
        // the client.  Tearing it down right away would silently discard the
        // response (and, if this happened to be the replacement daemon's
        // connection, SIGKILL that daemon as a side effect).
        respond(c, RESP_FAIL, 0, 0, NULL, false);
        return;
    }
    gdnsd_assert(css->handoff_fds_count >= 2LU);
    // The log line counts all handoff fds except two as "DNS sockets"
    const size_t dns_fds_send = css->handoff_fds_count - 2LU;
    log_info("REPLACE[old daemon]: Accepting takeover request from replacement PID %li, sending %zu DNS sockets", (long)take_pid, dns_fds_send);
    ev_io* w_accept = &css->w_accept;
    ev_io_stop(css->loop, w_accept); // there can be only one
    for (unsigned i = 0; i < css->socks_cfg->num_ctl_addrs; i++) {
        ev_io* w_tcp_accept = &css->tcp_lsnrs[i].w_tcp_accept;
        ev_io_stop(css->loop, w_tcp_accept);
    }
    respond(c, RESP_ACK, 0, 0, NULL, true);
}

// Permission check for requests arriving over a TCP control socket.
// Informational requests are always allowed, challenge requests need the
// listener's "chal_ok" permission, and zone-reload/replace requests need
// "ctl_ok".  Every other request key is refused.
F_NONNULL
static bool tcp_req_allowed(const ctl_addr_t* ctl_addr, char key)
{
    if (key == REQ_INFO || key == REQ_STAT || key == REQ_STATE)
        return true;

    if (key == REQ_CHAL || key == REQ_CHALF)
        return ctl_addr->chal_ok;

    if (key == REQ_ZREL || key == REQ_REPL)
        return ctl_addr->ctl_ok;

    return false;
}

// libev read-readiness callback for a connection.  In READING_DATA state it
// continues reading challenge data.  Otherwise it reads one fixed 8-byte
// request, applies the TCP permission check where applicable, and dispatches
// on the request key.  Most handlers end by setting up a response via
// respond(); a few defer it (e.g. REQ_ZREL, REQ_REPL) or send it directly
// (REQ_STOP).
F_NONNULL
static void css_conn_read(struct ev_loop* loop, ev_io* w, int revents V_UNUSED)
{
    gdnsd_assert(revents == EV_READ);
    css_conn_t* c = w->data;
    gdnsd_assert(c);
    css_t* css = c->css;
    gdnsd_assert(css);
    gdnsd_assert(c->state == READING_REQ || c->state == READING_DATA);

    if (c->state == READING_DATA) {
        // we'd switch below if more than one case, but REQ_CHAL is the only
        // key that causes READING_DATA so far.
        gdnsd_assert(c->rbuf.key == REQ_CHAL);
        recv_challenge_data(loop, w, c, css);
        return;
    }

    // The request must arrive as a single 8-byte read; would-block just
    // waits for the next event, and anything else closes the connection.
    const ssize_t pktlen = recv(c->fd, c->rbuf.raw, 8, MSG_DONTWAIT);
    if (pktlen != 8) {
        if (pktlen < 0 && ERRNO_WOULDBLOCK)
            return;
        if (pktlen == 0)
            log_debug("control socket client disconnected cleanly during read");
        else
            log_err("control socket read of 8 bytes failed with retval %zi, closing: %s", pktlen, logf_errno());
        css_conn_cleanup(c);
        return;
    }

    // If this is TCP, check perms and explicitly RESP_DENY if warranted
    if (c->ctl_addr && !tcp_req_allowed(c->ctl_addr, c->rbuf.key)) {
        ev_io_stop(loop, w);
        c->state = WAITING_SERVER;
        respond(c, RESP_DENY, 0, 0, NULL, false);
        return;
    }

    // REQ_CHAL is the only case so far where the client sends data after the
    // 8-byte standard request, using "d" as the raw data length and "v" as the
    // count of challenges sent in the data.
    if (c->rbuf.key == REQ_CHAL) {
        const unsigned count = csbuf_get_v(&c->rbuf);
        const unsigned dlen = c->rbuf.d;
        if (!count || count > CHAL_MAX_COUNT || !dlen || dlen > CHAL_MAX_DLEN) {
            log_err("Challenge request has illegal sizes (%u count, %u data), closing", count, dlen);
            css_conn_cleanup(c);
        } else {
            // The read watcher stays active; further read events land in the
            // READING_DATA branch at the top of this function.
            c->state = READING_DATA;
            c->size_done = 0;
            c->size = dlen;
            c->data = xmalloc(dlen);
        }
        return;
    }

    // All remaining request types are complete at this point: stop reading
    // until the response has been dealt with.
    ev_io_stop(loop, w);
    c->state = WAITING_SERVER;

    double nowish;
    size_t stats_size;
    size_t states_size;
    char* stats_msg;
    char* states_msg;

    switch (c->rbuf.key) {
    case REQ_INFO:
        respond(c, RESP_ACK, css->status_v, css->status_d, NULL, false);
        break;
    case REQ_STOP:
        handle_req_stop(c, css);
        break;
    case REQ_STAT:
        // Stats JSON is sent as extended response data, with its size in "d"
        nowish = ev_now(loop);
        stats_size = 0;
        stats_msg = statio_get_json((time_t)nowish, &stats_size);
        gdnsd_assert(stats_size <= UINT32_MAX);
        respond(c, RESP_ACK, 0, (uint32_t)stats_size, stats_msg, false);
        break;
    case REQ_STATE:
        // Monitoring states JSON is sent the same way as the stats above
        states_size = 0;
        states_msg = gdnsd_mon_states_get_json(&states_size);
        gdnsd_assert(states_size <= UINT32_MAX);
        respond(c, RESP_ACK, 0, (uint32_t)states_size, states_msg, false);
        break;
    case REQ_ZREL:
        handle_req_zrel(c, css);
        break;
    case REQ_CHALF:
        if (css->replacement_pid) {
            log_info("Deferring acme-dns-01-flush request while replace in progress");
            respond(c, RESP_LATR, 0, 0, NULL, false);
        } else {
            cset_flush(loop);
            respond(c, RESP_ACK, 0, 0, NULL, false);
        }
        break;
    case REQ_REPL:
        handle_req_repl(c, css);
        break;
    case REQ_TAK1:
        handle_req_tak1(c, css);
        break;
    case REQ_TAK2:
        handle_req_tak2(c, css);
        break;
    case REQ_TAKE:
        handle_req_take(c, css);
        break;
    default:
        log_err("Unknown request type %hhx from control socket", (uint8_t)c->rbuf.key);
        respond(c, RESP_UNK, 0, 0, NULL, false);
    }
}

// Accepts one pending connection from the listening socket of watcher "w",
// creates the per-connection state for it, starts reading requests from it
// and links it at the front of the server's client list.
// Returns the new connection, or NULL if accept4() failed (failures other
// than EAGAIN/EWOULDBLOCK/EINTR are logged).
F_NONNULL
static css_conn_t* css_accept(css_t* css, const ev_io* w)
{
    const int fd = accept4(w->fd, NULL, NULL, SOCK_NONBLOCK | SOCK_CLOEXEC);

    if (unlikely(fd < 0)) {
        // Transient conditions are silently ignored; the rest get logged
        const bool transient = (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR);
        if (!transient)
            log_err("control socket early connection failure: %s", logf_errno());
        return NULL;
    }

    // Zero-allocated per-connection state, with both watchers pointing back
    // at it through their data pointers
    css_conn_t* c = xcalloc(sizeof(*c));
    c->css = css;
    c->fd = fd;
    ev_io* rd_watcher = &c->w_read;
    ev_io_init(rd_watcher, css_conn_read, fd, EV_READ);
    ev_io* wr_watcher = &c->w_write;
    ev_io_init(wr_watcher, css_conn_write, fd, EV_WRITE);
    rd_watcher->data = c;
    wr_watcher->data = c;

    // Begin by waiting for the first request
    c->state = READING_REQ;
    ev_io_start(css->loop, rd_watcher);

    // Push onto the front of the client list
    css_conn_t* const old_head = css->clients;
    c->next = old_head;
    if (old_head)
        old_head->prev = c;
    css->clients = c;

    return c;
}

// libev read callback for the unix-domain listening socket: the watcher's
// data pointer carries the owning css_t, which is handed to css_accept().
F_NONNULL
static void css_accept_unix(struct ev_loop* loop V_UNUSED, ev_io* w, int revents V_UNUSED)
{
    gdnsd_assert(revents == EV_READ);

    css_t* owner = w->data;
    gdnsd_assert(owner);

    // The returned connection (if any) needs no further setup here
    css_accept(owner, w);
}

// libev read callback for a TCP listening socket: the watcher's data pointer
// carries the tcp_lsnr_t, which supplies both the owning css_t and the
// ctl_addr that gets recorded on any newly-accepted connection.
F_NONNULL
static void css_accept_tcp(struct ev_loop* loop V_UNUSED, ev_io* w, int revents V_UNUSED)
{
    gdnsd_assert(revents == EV_READ);

    const tcp_lsnr_t* listener = w->data;
    gdnsd_assert(listener);

    css_t* owner = listener->css;
    gdnsd_assert(owner);

    css_conn_t* conn = css_accept(owner, w);
    if (!conn)
        return;

    // Remember which configured control address this client arrived on
    conn->ctl_addr = listener->ctl_addr;
}

// Tries to adopt a single socket fd received during a daemon takeover.
//
// The fd's bound address (getsockname) and socket type (SO_TYPE) are looked
// up, and the fd is assigned to the first configured DNS thread that has no
// socket yet (sock == -1) and matches on both UDP-vs-TCP and address.
//
// On any failure, or when no thread wants it, the fd is closed here -- except
// when getsockname() reports EBADF, in which case there is nothing to close.
//
// socks_cfg: socket configuration whose dns_threads[] entries get filled in
// fd:        candidate file descriptor from the handoff
static void socks_import_fd(const socks_cfg_t* socks_cfg, const int fd)
{
    gdnsd_anysin_t fd_sin;
    memset(&fd_sin, 0, sizeof(fd_sin));
    fd_sin.len = GDNSD_ANYSIN_MAXLEN;

    if (getsockname(fd, &fd_sin.sa, &fd_sin.len)) {
        // errno is only meaningful because the call actually failed, and it
        // is captured immediately so that logging cannot disturb the
        // close-or-not decision below.
        const int gsn_errno = errno;
        if (gsn_errno == EBADF) {
            log_err("REPLACE[new daemon]: Socket handoff: Ignoring invalid file descriptor %i", fd);
        } else {
            log_err("REPLACE[new daemon]: Socket handoff: getsockname(%i) failed, closing: %s", fd, logf_errno());
            close(fd);
        }
        return;
    }

    // A successful call can still report a length larger than our storage,
    // meaning the address was truncated.  The fd itself is valid here, so it
    // must always be closed (errno is not consulted in this case).
    if (fd_sin.len > GDNSD_ANYSIN_MAXLEN) {
        log_err("REPLACE[new daemon]: Socket handoff: getsockname(%i) returned oversize address, closing", fd);
        close(fd);
        return;
    }

    int fd_sin_type = 0;
    socklen_t fd_sin_type_size = sizeof(fd_sin_type);
    if (getsockopt(fd, SOL_SOCKET, SO_TYPE, &fd_sin_type, &fd_sin_type_size)) {
        log_err("REPLACE[new daemon]: Socket handoff: cannot get type of fd %i @ %s, closing: %s", fd, logf_anysin(&fd_sin), logf_errno());
        close(fd);
        return;
    }

    // The call succeeded, so errno carries no information: report the
    // unexpected result itself rather than a stale error string.
    if (fd_sin_type_size != sizeof(fd_sin_type)
            || (fd_sin_type != SOCK_DGRAM && fd_sin_type != SOCK_STREAM)) {
        log_err("REPLACE[new daemon]: Socket handoff: unsupported socket type %i for fd %i @ %s, closing", fd_sin_type, fd, logf_anysin(&fd_sin));
        close(fd);
        return;
    }
    const bool fd_sin_is_udp = (fd_sin_type == SOCK_DGRAM);

    // First still-unassigned thread with matching protocol and address wins
    for (unsigned i = 0; i < socks_cfg->num_dns_threads; i++) {
        dns_thread_t* dt = &socks_cfg->dns_threads[i];
        if (dt->sock == -1 && dt->is_udp == fd_sin_is_udp
                && !gdnsd_anysin_cmp(&dt->ac->addr, &fd_sin)) {
            dt->sock = fd;
            return;
        }
    }

    log_info("REPLACE[new daemon]: Socket handoff: closing excess socket for address %s", logf_anysin(&fd_sin));
    close(fd);
}

// Feeds each of the "nfds" descriptors in "fds" to socks_import_fd(), in
// array order.
static void socks_import_fds(const socks_cfg_t* socks_cfg, const int* fds, const size_t nfds)
{
    size_t idx = 0;
    while (idx < nfds) {
        socks_import_fd(socks_cfg, fds[idx]);
        idx++;
    }
}

#ifndef SOL_TCP
#define SOL_TCP IPPROTO_TCP
#endif

// Creates a non-blocking, close-on-exec TCP socket, applies the listener
// socket options, then binds it to "addr" and puts it into listening state.
// Every failure along the way is reported through log_fatal().  Returns the
// listening fd.
F_NONNULL F_WUNUSED
static int make_tcp_listener_fd(const gdnsd_anysin_t* addr)
{
    const int lsock = socket(addr->sa.sa_family, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, IPPROTO_TCP);
    if (lsock < 0) {
        log_fatal("Failed to create TCP control socket: %s", logf_errno());
    }

    // Socket options are applied before bind()
    sockopt_bool_fatal(TCP, addr, lsock, SOL_SOCKET, SO_REUSEADDR, 1);
    sockopt_bool_fatal(TCP, addr, lsock, SOL_SOCKET, SO_REUSEPORT, 1);
    sockopt_bool_fatal(TCP, addr, lsock, SOL_TCP, TCP_NODELAY, 1);

    if (bind(lsock, &addr->sa, addr->len)) {
        log_fatal("bind() of TCP control socket %s failed: %s", logf_anysin(addr), logf_errno());
    }

    if (listen(lsock, 100)) {
        log_fatal("Failed to listen() on control socket %s: %s", logf_anysin(addr), logf_errno());
    }

    log_info("TCP control socket listener initialized @ %s", logf_anysin(addr));
    return lsock;
}

// Allocates one tcp_lsnr_t per configured control address, creates the
// listening socket for each, and initializes (but does not start) the
// matching accept watcher.
F_NONNULL
static void make_tcp_listeners(css_t* css)
{
    gdnsd_assert(css->socks_cfg);
    gdnsd_assert(css->socks_cfg->num_ctl_addrs);

    css->tcp_lsnrs = xcalloc_n(css->socks_cfg->num_ctl_addrs, sizeof(*css->tcp_lsnrs));

    for (unsigned idx = 0; idx < css->socks_cfg->num_ctl_addrs; idx++) {
        ctl_addr_t* ctl = &css->socks_cfg->ctl_addrs[idx];
        tcp_lsnr_t* listener = &css->tcp_lsnrs[idx];

        // Back-references used later by the accept callback
        listener->css = css;
        listener->ctl_addr = ctl;

        const int lsock = make_tcp_listener_fd(&ctl->addr);
        ev_io* accept_watcher = &listener->w_tcp_accept;
        ev_io_init(accept_watcher, css_accept_tcp, lsock, EV_READ);
        accept_watcher->data = listener;
    }
}

// Creates the unix-domain control socket: removes any existing socket file
// at the resolved run-directory path, binds a fresh non-blocking,
// close-on-exec stream socket there under a restrictive umask, and starts
// listening.  Every failure is reported through log_fatal().  Returns the
// listening fd.
F_WUNUSED
static int make_unix_listener_fd(void)
{
    const int lsock = socket(AF_UNIX, SOCK_STREAM | SOCK_NONBLOCK | SOCK_CLOEXEC, 0);
    if (lsock < 0) {
        log_fatal("Creating AF_UNIX socket failed: %s", logf_errno());
    }

    char* path = gdnsd_resolve_path_run(base_sock, NULL);
    struct sockaddr_un sun_addr;
    const socklen_t sun_len = gdnsd_sun_set_path(&sun_addr, path);

    // A missing file is fine; any other unlink failure is fatal
    if (unlink(path) && errno != ENOENT) {
        log_fatal("unlink(%s) failed: %s", path, logf_errno());
    }

    // The umask is narrowed (0177) only for the duration of the bind(), so
    // the socket file is created with restricted permissions rather than
    // fixed up with chmod() afterwards, which seems safer against perms
    // races.  umask() isn't thread-safe, but css_new() is called before
    // any threads are created.
    const mode_t saved_mask = umask(S_IXUSR | S_IRWXG | S_IRWXO);
    if (bind(lsock, (struct sockaddr*)&sun_addr, sun_len)) {
        log_fatal("bind() of unix domain socket %s failed: %s", path, logf_errno());
    }
    umask(saved_mask);

    if (listen(lsock, 100)) {
        log_fatal("Failed to listen() on control socket %s: %s", path, logf_errno());
    }

    free(path);
    return lsock;
}

/*********************
 * Public interfaces *
 *********************/

// Creates the control socket server object.
//
// argv0:     copied (xstrdup) into the new object
// socks_cfg: socket configuration; stored in the new object, and during a
//            takeover its DNS thread entries may receive inherited sockets
// csc_p:     optional pointer to a control socket client connected to an
//            already-running daemon.  When non-NULL, *csc_p must be non-NULL
//            on entry.
//
// Three outcomes, driven by a non-blocking exclusive flock() on the lock file:
//  * Lock acquired, no csc: normal startup with a freshly-created unix
//    listening socket.
//  * Lock busy (EWOULDBLOCK), no csc: returns NULL.
//  * Lock busy, csc given: sends REQ_TAKE over csc and adopts the lock fd,
//    the control socket fd, and any DNS sockets returned by the peer.
// If csc was given but the lock was acquired anyway, the csc is deleted,
// *csc_p is set to NULL, and startup proceeds as a normal non-replace one.
//
// The accept and replace watchers are initialized here but not started.
css_t* css_new(const char* argv0, socks_cfg_t* socks_cfg, csc_t** csc_p)
{
    csc_t* csc = NULL;
    if (csc_p) {
        csc = *csc_p;
        gdnsd_assert(csc);
    }

    int sock_fd = -1;
    char* lock_path = gdnsd_resolve_path_run(base_lock, NULL);
    int lock_fd = open(lock_path, O_RDONLY | O_CREAT | O_CLOEXEC, S_IRUSR | S_IWUSR);
    if (lock_fd < 0)
        log_fatal("cannot open control sock lock at %s: %s", lock_path, logf_errno());

    if (flock(lock_fd, LOCK_EX | LOCK_NB)) {
        // Only "already locked by someone else" is survivable here
        if (errno != EWOULDBLOCK)
            log_fatal("cannot lock control sock lock at %s: %s", lock_path, logf_errno());
        // Drop our own unlocked descriptor; in the takeover case below it is
        // replaced by the fd received in the REQ_TAKE response.
        close(lock_fd);
        lock_fd = -1;
        if (!csc) {
            free(lock_path);
            return NULL;
        }
    } else if (csc) {
        // We got the lock even though a csc connection was supplied
        log_warn("REPLACE[new daemon]: old daemon at %li appears to have exited while we were starting, executing a normal non-replace startup!",
                 (long)csc_get_server_pid(csc));
        csc_delete(csc);
        csc = NULL;
        *csc_p = NULL;
    }

    free(lock_path);

    if (csc) {
        // Takeover: request the peer's fds, identifying ourselves by pid
        csbuf_t req;
        csbuf_t resp;
        memset(&req, 0, sizeof(req));
        req.key = REQ_TAKE;
        req.d = (uint32_t)getpid();
        int* resp_fds = NULL;
        const size_t fds_recvd = csc_txn_getfds(csc, &req, &resp, &resp_fds);
        // Received layout: [0] = lock fd, [1] = control socket fd,
        // [2..] = DNS sockets
        gdnsd_assert(fds_recvd >= 2U);
        gdnsd_assert(sock_fd == -1);
        gdnsd_assert(lock_fd == -1);
        lock_fd = resp_fds[0];
        sock_fd = resp_fds[1];
        const size_t dns_fd_count = fds_recvd - 2U;
        log_info("REPLACE[new daemon]: Takeover request accepted, received %zu DNS sockets", dns_fd_count);
        socks_import_fds(socks_cfg, &resp_fds[2], dns_fd_count);
        free(resp_fds);
    }

    css_t* css = xcalloc(sizeof(*css));
    css->lock_fd = lock_fd;
    css->argv0 = xstrdup(argv0);
    css->socks_cfg = socks_cfg;
    // status_d carries our pid and status_v the packed package version
    css->status_d = (uint32_t)getpid();
    css->status_v = csbuf_make_v(PACKAGE_V_MAJOR, PACKAGE_V_MINOR, PACKAGE_V_PATCH);

    // Reuse the inherited control socket if the takeover gave us one
    if (sock_fd > -1)
        css->fd = sock_fd;
    else
        css->fd = make_unix_listener_fd();

    if (css->socks_cfg->num_ctl_addrs)
        make_tcp_listeners(css);

    ev_io* w_accept = &css->w_accept;
    ev_io_init(w_accept, css_accept_unix, css->fd, EV_READ);
    w_accept->data = css;

    // Repeating timer: 1.0s initial delay, 1.0s repeat interval
    ev_timer* w_replace = &css->w_replace;
    ev_timer_init(w_replace, css_watch_replace, 1.0, 1.0);
    w_replace->data = css;

    return css;
}

// Attaches the control socket server to "loop": starts the unix and TCP
// accept watchers, then builds the array of fds offered during a takeover
// (lock fd, control socket fd, then one socket per DNS thread).
void css_start(css_t* css, struct ev_loop* loop)
{
    css->loop = loop;

    // Begin accepting on the unix socket and every TCP listener
    ev_io* unix_watcher = &css->w_accept;
    ev_io_start(css->loop, unix_watcher);
    for (unsigned lsnr_idx = 0; lsnr_idx < css->socks_cfg->num_ctl_addrs; lsnr_idx++) {
        ev_io* tcp_watcher = &css->tcp_lsnrs[lsnr_idx].w_tcp_accept;
        ev_io_start(css->loop, tcp_watcher);
    }

    // Two fixed slots (lock + control socket) precede the DNS sockets
    gdnsd_assert(css->socks_cfg->num_dns_threads);
    css->handoff_fds_count = css->socks_cfg->num_dns_threads + 2U;
    gdnsd_assert(css->handoff_fds_count <= 0xFFFFFF);
    css->handoff_fds = xmalloc_n(css->handoff_fds_count, sizeof(*css->handoff_fds));

    css->handoff_fds[0] = css->lock_fd;
    css->handoff_fds[1] = css->fd;
    for (unsigned thr_idx = 0; thr_idx < css->socks_cfg->num_dns_threads; thr_idx++) {
        css->handoff_fds[thr_idx + 2] = css->socks_cfg->dns_threads[thr_idx].sock;
    }

    log_debug("Entering runtime loop in main thread, listening to control socket");
}

// Reports the outcome of a finished zone reload to every client in the
// active reload queue (RESP_FAIL when "failed", RESP_ACK otherwise), empties
// that queue, then swaps the active and queued reload queues.
//
// Returns true when the newly-active queue already holds waiters, i.e. the
// caller should start another reload.
bool css_notify_zone_reloaders(css_t* css, const bool failed)
{
    // Tell each waiting control socket client how the reload went
    for (size_t idx = 0; idx < css->reload_zones_active.len; idx++) {
        if (failed)
            respond(css->reload_zones_active.q[idx], RESP_FAIL, 0, 0, NULL, false);
        else
            respond(css->reload_zones_active.q[idx], RESP_ACK, 0, 0, NULL, false);
    }

    // Everyone in the active queue has been answered
    conn_queue_clear(&css->reload_zones_active);

    // Clients that piled up during the previous reload become the new
    // active set
    swap_reload_zones_queues(css);

    return css->reload_zones_active.len != 0;
}

// During a "replace", this is the final communication over the daemon<->daemon
// control socket, and happens very late.  We're already past the point of no
// return (new sent REQ_STOP to old, and old ACK'd it), stats continuity
// isn't critical to operations, and no further communications are intended
// (including no response to this message) so failures here are non-fatal.
//
// Sends an 8-byte PSH_SHAND header whose "d" field carries the payload
// length, followed by the serialized stats payload itself.  Does nothing if
// there is no takeover connection from a replacement daemon.  Failures are
// logged and otherwise ignored; the serialized buffer is always freed.
//
// A send() that is merely interrupted by a signal (EINTR) is retried rather
// than treated as a failure, so that an unrelated signal arriving at this
// moment does not discard the stats.
void css_send_stats_handoff(const css_t* css)
{
    // no-op if we don't have a takeover connection from a newer daemon
    if (!css->replace_conn_dmn)
        return;

    const css_conn_t* c = css->replace_conn_dmn;
    size_t dlen = 0;
    char* data = statio_serialize(&dlen);
    // The length travels in a 32-bit header field, same bound as the
    // stats/state response paths
    gdnsd_assert(dlen <= UINT32_MAX);

    csbuf_t handoff;
    memset(&handoff, 0, sizeof(handoff));
    handoff.key = PSH_SHAND;
    csbuf_set_v(&handoff, 0);
    handoff.d = (uint32_t)dlen;

    ssize_t pktlen;
    do {
        pktlen = send(c->fd, handoff.raw, 8, 0);
    } while (pktlen < 0 && errno == EINTR);
    if (pktlen != 8) {
        log_err("REPLACE[old daemon]: Stats handoff failed: blocking control socket write of 8 bytes failed with retval %zi: %s", pktlen, logf_errno());
        free(data);
        return;
    }

    // The payload may go out in several pieces; keep going until all of it
    // has been accepted or a real error occurs
    size_t done = 0;
    while (done < dlen) {
        const size_t wanted = dlen - done;
        const ssize_t sent = send(c->fd, &data[done], wanted, 0);
        if (sent < 0) {
            if (errno == EINTR)
                continue; // interrupted before sending anything: retry
            log_err("REPLACE[old daemon]: Stats handoff failed: %zu-byte send() failed: %s", wanted, logf_errno());
            free(data);
            return;
        }
        done += (size_t)sent;
    }

    free(data);
}

// Tears down the control socket server: cleans up every client connection,
// stops and closes all listeners, releases the reload queues and handoff fd
// array, closes the control socket and lock fds, and frees the object.
void css_delete(css_t* css)
{
    // Walk the client list.  The successor is read before each cleanup
    // call, so the walk never touches a node after it has been cleaned up.
    for (css_conn_t* conn = css->clients, *following = NULL; conn; conn = following) {
        following = conn->next;
        css_conn_cleanup(conn);
    }

    // Stop and close each TCP listener, then release their storage
    if (css->socks_cfg->num_ctl_addrs) {
        for (unsigned idx = 0; idx < css->socks_cfg->num_ctl_addrs; idx++) {
            ev_io* tcp_watcher = &css->tcp_lsnrs[idx].w_tcp_accept;
            ev_io_stop(css->loop, tcp_watcher);
            close(tcp_watcher->fd);
        }
        free(css->tcp_lsnrs);
    }

    // Both reload queues are emptied
    conn_queue_clear(&css->reload_zones_queued);
    conn_queue_clear(&css->reload_zones_active);

    // free(NULL) is a no-op, so no guard is needed if the array was never
    // allocated
    free(css->handoff_fds);

    ev_io* unix_watcher = &css->w_accept;
    ev_io_stop(css->loop, unix_watcher);
    ev_timer* replace_timer = &css->w_replace;
    ev_timer_stop(css->loop, replace_timer);

    close(css->fd);
    close(css->lock_fd); // Closing the lock fd implicitly clears the lock

    free(css->argv0);
    free(css);
}
