twitter · charsyam · Oct 13, 2015 · Oct 21, 2015 · TysonAndre · Apr 28, 2021
diff --git a/src/hashkit/nc_ketama.c b/src/hashkit/nc_ketama.c
@@ -90,12 +90,8 @@ ketama_update(struct server_pool *pool)
         struct server *server = array_get(&pool->server, server_index);
 
         if (pool->auto_eject_hosts) {
-            if (server->next_retry <= now) {
-                server->next_retry = 0LL;
+            if (server->fail == FAIL_STATUS_NORMAL) {
                 nlive_server++;
-            } else if (pool->next_rebuild == 0LL ||
-                       server->next_retry < pool->next_rebuild) {
-                pool->next_rebuild = server->next_retry;
             }
         } else {
             nlive_server++;
@@ -104,7 +100,7 @@ ketama_update(struct server_pool *pool)
         ASSERT(server->weight > 0);
 
         /* count weight only for live servers */
-        if (!pool->auto_eject_hosts || server->next_retry <= now) {
+        if (!pool->auto_eject_hosts || server->fail == FAIL_STATUS_NORMAL) {
             total_weight += server->weight;
         }
     }

diff --git a/src/hashkit/nc_modula.c b/src/hashkit/nc_modula.c
@@ -54,12 +54,8 @@ modula_update(struct server_pool *pool)
         struct server *server = array_get(&pool->server, server_index);
 
         if (pool->auto_eject_hosts) {
-            if (server->next_retry <= now) {
-                server->next_retry = 0LL;
+            if (server->fail == FAIL_STATUS_NORMAL) {
                 nlive_server++;
-            } else if (pool->next_rebuild == 0LL ||
-                       server->next_retry < pool->next_rebuild) {
-                pool->next_rebuild = server->next_retry;
             }
         } else {
             nlive_server++;
@@ -68,7 +64,7 @@ modula_update(struct server_pool *pool)
         ASSERT(server->weight > 0);
 
         /* count weight only for live servers */
-        if (!pool->auto_eject_hosts || server->next_retry <= now) {
+        if (!pool->auto_eject_hosts || server->fail == FAIL_STATUS_NORMAL) {
             total_weight += server->weight;
         }
     }

diff --git a/src/hashkit/nc_random.c b/src/hashkit/nc_random.c
@@ -51,12 +51,8 @@ random_update(struct server_pool *pool)
         struct server *server = array_get(&pool->server, server_index);
 
         if (pool->auto_eject_hosts) {
-            if (server->next_retry <= now) {
-                server->next_retry = 0LL;
+            if (server->fail == FAIL_STATUS_NORMAL) {
                 nlive_server++;
-            } else if (pool->next_rebuild == 0LL ||
-                       server->next_retry < pool->next_rebuild) {
-                pool->next_rebuild = server->next_retry;
             }
         } else {
             nlive_server++;

diff --git a/src/nc_client.c b/src/nc_client.c
@@ -187,3 +187,8 @@ client_close(struct context *ctx, struct conn *conn)
 
     conn_put(conn);
 }
+
+void
+client_restore(struct context *ctx, struct conn *conn)
+{ /* restore handler for client connections: performs no work */
+}
diff --git a/src/nc_client.h b/src/nc_client.h
@@ -24,5 +24,6 @@ bool client_active(struct conn *conn);
 void client_ref(struct conn *conn, void *owner);
 void client_unref(struct conn *conn);
 void client_close(struct context *ctx, struct conn *conn);
+void client_restore(struct context *ctx, struct conn *conn);
 
 #endif
diff --git a/src/nc_conf.c b/src/nc_conf.c
@@ -167,6 +167,7 @@ conf_server_each_transform(void *elem, void *data)
 
     s->next_retry = 0LL;
     s->failure_count = 0;
+    s->fail = FAIL_STATUS_NORMAL;
 
     log_debug(LOG_VERB, "transform to server %"PRIu32" '%.*s'",
               s->idx, s->pname.len, s->pname.data);
@@ -1136,6 +1137,14 @@ conf_pre_validate(struct conf *cf)
     return NC_OK;
 }
 
+static int
+conf_server_pname_cmp(const void *t1, const void *t2)
+{
+    /* comparison callback: both arguments are treated as conf_server entries */
+    const struct conf_server *lhs = t1, *rhs = t2;
+    return string_compare(&lhs->pname, &rhs->pname);
+}
+
 static int
 conf_server_name_cmp(const void *t1, const void *t2)
 {

diff --git a/src/nc_connection.c b/src/nc_connection.c
@@ -194,6 +194,7 @@ conn_get(void *owner, bool client, bool redis)
 
         conn->close = client_close;
         conn->active = client_active;
+        conn->restore = client_restore;
 
         conn->ref = client_ref;
         conn->unref = client_unref;
@@ -221,6 +222,7 @@ conn_get(void *owner, bool client, bool redis)
 
         conn->close = server_close;
         conn->active = server_active;
+        conn->restore = server_restore;
 
         conn->ref = server_ref;
         conn->unref = server_unref;
@@ -269,6 +271,7 @@ conn_get_proxy(void *owner)
 
     conn->close = proxy_close;
     conn->active = NULL;
+    conn->restore = proxy_restore;
 
     conn->ref = proxy_ref;
     conn->unref = proxy_unref;

diff --git a/src/nc_connection.h b/src/nc_connection.h
@@ -33,6 +33,7 @@ typedef bool (*conn_active_t)(struct conn *);
 
 typedef void (*conn_ref_t)(struct conn *, void *);
 typedef void (*conn_unref_t)(struct conn *);
+typedef void (*conn_restore_t)(struct context *, struct conn *);
 
 typedef void (*conn_msgq_t)(struct context *, struct conn *, struct msg *);
 typedef void (*conn_post_connect_t)(struct context *ctx, struct conn *, struct server *server);
@@ -60,6 +61,7 @@ struct conn {
     conn_send_done_t    send_done;       /* write done handler */
     conn_close_t        close;           /* close handler */
     conn_active_t       active;          /* active? handler */
+    conn_restore_t      restore;         /* restore handler */
     conn_post_connect_t post_connect;    /* post connect handler */
     conn_swallow_msg_t  swallow_msg;     /* react on messages to be swallowed */
 
@@ -105,5 +107,6 @@ uint32_t conn_ncurr_conn(void);
 uint64_t conn_ntotal_conn(void);
 uint32_t conn_ncurr_cconn(void);
 bool conn_authenticated(struct conn *conn);
+rstatus_t event_add_out_with_conn(struct context *ctx, struct conn *conn, struct msg *msg);
 
 #endif
diff --git a/src/nc_core.c b/src/nc_core.c
@@ -24,6 +24,30 @@
 
 static uint32_t ctx_id; /* context generation */
 
+/* Run array_init() on both failed-server lists, sized for struct server pointers. */
+static void
+core_failed_servers_init(struct context *ctx)
+{
+    struct array *list = &(ctx->failed_servers[0]);
+    /* NOTE(review): array_init()'s result is discarded -- confirm it cannot fail */
+    array_init(list, 10, sizeof(struct server *));
+    array_init(list + 1, 10, sizeof(struct server *));
+}
+
+/* Pop every entry off both failed-server lists, then array_deinit() each list. */
+static void
+core_failed_servers_deinit(struct context *ctx)
+{
+    uint32_t i, n;
+    for (i = 0; i < 2; i++) {
+        struct array *list = &(ctx->failed_servers[i]); /* only indices 0 and 1 exist */
+        for (n = array_n(list); n > 0; n--) {
+            array_pop(list);
+        }
+        array_deinit(list);
+    }
+}
+
 static rstatus_t
 core_calc_connections(struct context *ctx)
 {
@@ -60,6 +84,11 @@ core_ctx_create(struct instance *nci)
     ctx->stats = NULL;
     ctx->evb = NULL;
     array_null(&ctx->pool);
+    array_null(&(ctx->failed_servers[0]));
+    array_null(&(ctx->failed_servers[1]));
+    ctx->failed_idx = 0;
+    ctx->fails = &(ctx->failed_servers[0]);
+
     ctx->max_timeout = nci->stats_interval;
     ctx->timeout = ctx->max_timeout;
     ctx->max_nfd = 0;
@@ -93,6 +122,8 @@ core_ctx_create(struct instance *nci)
         return NULL;
     }
 
+    core_failed_servers_init(ctx);
+
     /* create stats per server pool */
     ctx->stats = stats_create(nci->stats_port, nci->stats_addr, nci->stats_interval,
                               nci->hostname, &ctx->pool);
@@ -261,6 +292,41 @@ core_error(struct context *ctx, struct conn *conn)
     core_close(ctx, conn);
 }
 
+/*
+ * Walk the failed-server list that was current on entry and, for every
+ * entry, either attempt server_reconnect() or hand the entry back to
+ * add_failed_server().
+ * ctx->fails and ctx->failed_idx are switched to the other list before
+ * the walk starts. NOTE(review): presumably add_failed_server() appends to
+ * ctx->fails, so re-queued entries are not revisited in this pass -- confirm.
+ */
+static void
+retry_connection(struct context *ctx)
+{
+    struct array *pending = ctx->fails;
+    uint32_t remaining;
+    int64_t now;
+
+    ctx->failed_idx = (ctx->failed_idx == 0) ? 1 : 0;
+    ctx->fails = &(ctx->failed_servers[ctx->failed_idx]);
+
+    now = nc_usec_now();
+    for (remaining = array_n(pending); remaining > 0; remaining--) {
+        struct server *server = *(struct server **)array_pop(pending);
+
+        /* retry time still in the future: keep waiting */
+        if (server->next_retry != 0 && server->next_retry >= now) {
+            add_failed_server(ctx, server);
+            continue;
+        }
+
+        /* due for a retry; a failed attempt puts it back in the queue */
+        if (server_reconnect(ctx, server) != NC_OK) {
+            add_failed_server(ctx, server);
+        }
+    }
+}
+
 static void
 core_timeout(struct context *ctx)
 {
@@ -272,14 +338,14 @@ core_timeout(struct context *ctx)
         msg = msg_tmo_min();
         if (msg == NULL) {
             ctx->timeout = ctx->max_timeout;
-            return;
+            break;
         }
 
         /* skip over req that are in-error or done */
 
         if (msg->error || msg->done) {
             msg_tmo_delete(msg);
             continue;
         }
 
         /*
@@ -304,6 +370,8 @@ core_timeout(struct context *ctx)
 
         core_close(ctx, conn);
     }
+
+    retry_connection(ctx);
 }
 
 rstatus_t
@@ -324,6 +392,7 @@ core_core(void *arg, uint32_t events)
               conn->client ? 'c' : (conn->proxy ? 'p' : 's'), conn->sd);
 
     conn->events = events;
+    conn->restore(ctx, conn);
 
     /* error takes precedence over read | write */
     if (events & EVENT_ERR) {

diff --git a/src/nc_core.h b/src/nc_core.h
@@ -124,6 +124,10 @@ struct context {
     struct stats       *stats;      /* stats */
 
     struct array       pool;        /* server_pool[] */
+    struct array       failed_servers[2];   /* failed servers */
+    struct array       *fails;              /* ref of current fails server */
+
+    int                failed_idx;           /* current idx for failed servers */
     struct event_base  *evb;        /* event base */
     int                max_timeout; /* max timeout in msec */
     int                timeout;     /* timeout in msec */

diff --git a/src/nc_proxy.c b/src/nc_proxy.c
@@ -406,3 +406,8 @@ proxy_recv(struct context *ctx, struct conn *conn)
 
     return NC_OK;
 }
+
+void
+proxy_restore(struct context *ctx, struct conn *conn)
+{ /* restore handler for proxy connections: performs no work */
+}
diff --git a/src/nc_proxy.h b/src/nc_proxy.h
@@ -30,5 +30,6 @@ rstatus_t proxy_each_deinit(void *elem, void *data);
 rstatus_t proxy_init(struct context *ctx);
 void proxy_deinit(struct context *ctx);
 rstatus_t proxy_recv(struct context *ctx, struct conn *conn);
+void proxy_restore(struct context *ctx, struct conn *conn);
 
 #endif
diff --git a/src/nc_response.c b/src/nc_response.c
@@ -142,9 +142,12 @@ static bool
 rsp_filter(struct context *ctx, struct conn *conn, struct msg *msg)
 {
     struct msg *pmsg;
+    struct server *server;
 
     ASSERT(!conn->client && !conn->proxy);
 
+    server = (struct server *)conn->owner;
+
     if (msg_empty(msg)) {
         ASSERT(conn->rmsg == NULL);
         log_debug(LOG_VERB, "filter empty rsp %"PRIu64" on s %d", msg->id,
@@ -204,6 +207,13 @@ rsp_filter(struct context *ctx, struct conn *conn, struct msg *msg)
     }
 
     if (pmsg->swallow) {
+        if (server->fail == FAIL_STATUS_ERR_TRY_HEARTBEAT) {
+            struct conn *c_conn;
+
+            c_conn = pmsg->owner; 
+            server_restore_from_heartbeat(server, c_conn);
+        }
+
         conn->swallow_msg(conn, pmsg, msg);
 
         conn->dequeue_outq(ctx, conn, pmsg);