This thread contains a patchset. You're looking at the original emails,
but you may wish to use the patch review UI.
Review patch
1
[PATCH] log: Try to fix stuck replication in some cases
Signed-off-by: Louis Solofrizzo <lsolofrizzo@scaleway.com>
---
log.c | 24 ++++++++++++++++++++++--
node.h | 3 +++
2 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/log.c b/log.c
index d7cd67a..d761f14 100644
--- a/log.c
+++ b/log.c
@@ -660,8 +660,28 @@ void libfloat_append_entries_response(libfloat_ctx_t *ctx, libfloat_rpc_append_e
return;
}
- ERROR(ctx, "libfloat_append_entries_response: node %d: received current_index (%u) older than replicated_log (%u)",
- node->id, resp->current_index, node->replicated_log);
+ if (node->announced_log == resp->current_index)
+ {
+ node->announced_log_count++;
+ if (node->announced_log_count == 20)
+ {
+ /* We have 20 consecutive heartbeats telling the same story, let's believe it */
+
+ node->announced_log_count = 0;
+ node->next_log_to_send = max(resp->current_index + 1, 1);
+ node->replicated_log = resp->current_index;
+ libfloat_send_append_entries(ctx, node, false);
+ return;
+ }
+ }
+ else
+ {
+ node->announced_log_count = 0;
+ node->announced_log = resp->current_index;
+ }
+
+ ERROR(ctx, "libfloat_append_entries_response: node %d: received current_index (%u) older than replicated_log (%u) (count=%lu)",
+ node->id, resp->current_index, node->replicated_log, node->announced_log_count);
return;
}
diff --git a/node.h b/node.h
index f49ce9c..9f0bed8 100644
--- a/node.h
+++ b/node.h
@@ -7,6 +7,9 @@ typedef struct {
libfloat_entry_id_t next_log_to_send; /*!< Next log to send to this node */
libfloat_entry_id_t replicated_log; /*!< Last known replicated log of this node */
+ libfloat_entry_id_t announced_log;
+ uint64_t announced_log_count;
+
uint8_t has_voted_for_me : 1;
uint8_t is_up_to_date : 1;
--
2.36.1
LG
May 31, 2022 13:17:28 Louis Solofrizzo <lsolofrizzo@scaleway.com>:
> Signed-off-by: Louis Solofrizzo <lsolofrizzo@scaleway.com>
> ---
> log.c | 24 ++++++++++++++++++++++--
> node.h | 3 +++
> 2 files changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/log.c b/log.c
> index d7cd67a..d761f14 100644
> --- a/log.c
> +++ b/log.c
> @@ -660,8 +660,28 @@ void libfloat_append_entries_response(libfloat_ctx_t *ctx, libfloat_rpc_append_e
> return;
> }
>
> - ERROR(ctx, "libfloat_append_entries_response: node %d: received current_index (%u) older than replicated_log (%u)",
> - node->id, resp->current_index, node->replicated_log);
> + if (node->announced_log == resp->current_index)
> + {
> + node->announced_log_count++;
> + if (node->announced_log_count == 20)
> + {
> + /* We have 20 consecutive heartbeats telling the same story, let's believe it */
> +
> + node->announced_log_count = 0;
> + node->next_log_to_send = max(resp->current_index + 1, 1);
> + node->replicated_log = resp->current_index;
> + libfloat_send_append_entries(ctx, node, false);
> + return;
> + }
> + }
> + else
> + {
> + node->announced_log_count = 0;
> + node->announced_log = resp->current_index;
> + }
> +
> + ERROR(ctx, "libfloat_append_entries_response: node %d: received current_index (%u) older than replicated_log (%u) (count=%lu)",
> + node->id, resp->current_index, node->replicated_log, node->announced_log_count);
> return;
> }
>
> diff --git a/node.h b/node.h
> index f49ce9c..9f0bed8 100644
> --- a/node.h
> +++ b/node.h
> @@ -7,6 +7,9 @@ typedef struct {
> libfloat_entry_id_t next_log_to_send; /*!< Next log to send to this node */
> libfloat_entry_id_t replicated_log; /*!< Last known replicated log of this node */
>
> + libfloat_entry_id_t announced_log;
> + uint64_t announced_log_count;
> +
> uint8_t has_voted_for_me : 1;
> uint8_t is_up_to_date : 1;
>
> --
> 2.36.1