[PATCH] log: Try to fix stuck replication in some cases
Export this patch
Signed-off-by: Louis Solofrizzo <lsolofrizzo@scaleway.com>
---
log.c | 24 ++++++++++++++++++++++--
node.h | 3 +++
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/log.c b/log.c
index d7cd67a..d761f14 100644
--- a/log.c
+++ b/log.c
@@ -660,8 +660,28 @@ void libfloat_append_entries_response(libfloat_ctx_t *ctx, libfloat_rpc_append_e
return;
}
- ERROR(ctx, "libfloat_append_entries_response: node %d: received current_index (%u) older than replicated_log (%u)",
- node->id, resp->current_index, node->replicated_log);
+ if (node->announced_log == resp->current_index)
+ {
+ node->announced_log_count++;
+ if (node->announced_log_count == 20)
+ {
+ /* We have 20 consecutive heartbeats telling the same story, let's believe it */
+
+ node->announced_log_count = 0;
+ node->next_log_to_send = max(resp->current_index + 1, 1);
+ node->replicated_log = resp->current_index;
+ libfloat_send_append_entries(ctx, node, false);
+ return;
+ }
+ }
+ else
+ {
+ node->announced_log_count = 0;
+ node->announced_log = resp->current_index;
+ }
+
+ ERROR(ctx, "libfloat_append_entries_response: node %d: received current_index (%u) older than replicated_log (%u) (count=%lu)",
+ node->id, resp->current_index, node->replicated_log, node->announced_log_count);
return;
}
diff --git a/node.h b/node.h
index f49ce9c..9f0bed8 100644
--- a/node.h
+++ b/node.h
@@ -7,6 +7,9 @@ typedef struct {
libfloat_entry_id_t next_log_to_send; /*!< Next log to send to this node */
libfloat_entry_id_t replicated_log; /*!< Last known replicated log of this node */
+ libfloat_entry_id_t announced_log;
+ uint64_t announced_log_count;
+
uint8_t has_voted_for_me : 1;
uint8_t is_up_to_date : 1;
--
2.36.1
LG
May 31, 2022 13:17:28 Louis Solofrizzo <lsolofrizzo@scaleway.com>: