forked from fghaas/openstacksummit2014-atlanta
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdrbd-race.patch
152 lines (136 loc) · 6.66 KB
/
drbd-race.patch
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
From 0a1b4f08152866f85334da6a29d282f9c8500ff1 Mon Sep 17 00:00:00 2001
From: Vincent Untz <[email protected]>
Date: Mon, 12 May 2014 10:02:21 -0400
Subject: [PATCH] ha: Fix race on creation of drbd resources
The drbd OCF resource agent expects the drbd primitive and the drbd ms
resource to be created in the same transaction. When they are not, the
drbd primitive will fail to be monitored, which can lead to fencing.
We didn't see that all the time because we create the ms resource fast,
so most of the time, it just works.
---
chef/cookbooks/postgresql/recipes/ha_storage.rb | 34 ++++++++++++++++++++++---
1 file changed, 30 insertions(+), 4 deletions(-)
diff --git a/chef/cookbooks/postgresql/recipes/ha_storage.rb b/chef/cookbooks/postgresql/recipes/ha_storage.rb
index 247f592..7f7e5a1 100644
--- a/chef/cookbooks/postgresql/recipes/ha_storage.rb
+++ b/chef/cookbooks/postgresql/recipes/ha_storage.rb
@@ -71,10 +71,33 @@
drbd_params = {}
drbd_params["drbd_resource"] = drbd_resource
+ # Careful here: when creating the DRBD resources, we should in theory create
+ # the primitive and the ms resources in one transaction. This is what is
+ # expected by the drbd OCF agent, which checks (for the primitive) that the
+ # meta bits (like clone-max), that are supposed to be inherited from the
+ # required ms resource, are present.
+ #
+ # When that check fails, we can see this error:
+ # ERROR: meta parameter misconfigured, expected clone-max -le 2, but found unset.
+ # which will cause fencing (the resource is not supposed to start, but the
+ # monitoring op fails, so pacemaker tries to stop the resource to be safe,
+ # and that fails too, leading to fencing).
+ #
+ # However, we cannot create the two resources in one transaction. So what we
+ # do instead is on initial creation (and only in that case), we mark the drbd
+ # primitive as unmanaged (that will cause the failure to not be fatal, with
+ # no fencing), and we clean it up + mark it as managed after we create the ms
+ # resource.
+
pacemaker_primitive drbd_primitive do
agent "ocf:linbit:drbd"
params drbd_params
op postgres_op
+ # See big comment above as to why we do that. We know that the founder will
+ # go first here, so we only do this on the founder.
+ meta ({
+ "is-managed" => "false"
+ }) if (CrowbarPacemakerHelper.is_cluster_founder?(node) && ! ::Kernel.system("crm configure show #{drbd_primitive} &> /dev/null"))
action :create
end
@@ -90,12 +113,15 @@
action :create
end
- # This is needed because we don't create all the pacemaker resources in the
- # same transaction
- execute "Cleanup #{drbd_primitive} on #{ms_name} start" do
- command "sleep 2; crm resource cleanup #{drbd_primitive}"
+ # See big comment above as to why we do that. We know that the founder will
+ # go first here, so we only do this on the founder, but in short: this is
+ # needed because we don't create all the pacemaker resources in the same
+ # transaction.
+ execute "Cleanup and manage #{drbd_primitive} on #{ms_name} start" do
+ command "sleep 2; crm resource cleanup #{drbd_primitive}; crm_resource --resource #{drbd_primitive} --delete-parameter \"is-managed\" --meta || :"
action :nothing
subscribes :run, "pacemaker_ms[#{ms_name}]", :immediately
+ only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
end
end
--
1.9.1
From f80f43c4c6ead879113f69695bc40797fe4ed1bd Mon Sep 17 00:00:00 2001
From: Vincent Untz <[email protected]>
Date: Mon, 12 May 2014 10:07:37 -0400
Subject: [PATCH] ha: Fix race on creation of drbd resources
The drbd OCF resource agent expects the drbd primitive and the drbd ms
resource to be created in the same transaction. When they are not, the
drbd primitive will fail to be monitored, which can lead to fencing.
We didn't see that all the time because we create the ms resource fast,
so most of the time, it just works.
---
chef/cookbooks/rabbitmq/recipes/ha.rb | 34 ++++++++++++++++++++++++++++++----
1 file changed, 30 insertions(+), 4 deletions(-)
diff --git a/chef/cookbooks/rabbitmq/recipes/ha.rb b/chef/cookbooks/rabbitmq/recipes/ha.rb
index 5292ef6..cef35d8 100644
--- a/chef/cookbooks/rabbitmq/recipes/ha.rb
+++ b/chef/cookbooks/rabbitmq/recipes/ha.rb
@@ -62,10 +62,33 @@
drbd_params = {}
drbd_params["drbd_resource"] = drbd_resource
+ # Careful here: when creating the DRBD resources, we should in theory create
+ # the primitive and the ms resources in one transaction. This is what is
+ # expected by the drbd OCF agent, which checks (for the primitive) that the
+ # meta bits (like clone-max), that are supposed to be inherited from the
+ # required ms resource, are present.
+ #
+ # When that check fails, we can see this error:
+ # ERROR: meta parameter misconfigured, expected clone-max -le 2, but found unset.
+ # which will cause fencing (the resource is not supposed to start, but the
+ # monitoring op fails, so pacemaker tries to stop the resource to be safe,
+ # and that fails too, leading to fencing).
+ #
+ # However, we cannot create the two resources in one transaction. So what we
+ # do instead is on initial creation (and only in that case), we mark the drbd
+ # primitive as unmanaged (that will cause the failure to not be fatal, with
+ # no fencing), and we clean it up + mark it as managed after we create the ms
+ # resource.
+
pacemaker_primitive drbd_primitive do
agent "ocf:linbit:drbd"
params drbd_params
op rabbitmq_op
+ # See big comment above as to why we do that. We know that the founder will
+ # go first here, so we only do this on the founder.
+ meta ({
+ "is-managed" => "false"
+ }) if (CrowbarPacemakerHelper.is_cluster_founder?(node) && ! ::Kernel.system("crm configure show #{drbd_primitive} &> /dev/null"))
action :create
end
@@ -81,12 +104,15 @@
action :create
end
- # This is needed because we don't create all the pacemaker resources in the
- # same transaction
- execute "Cleanup #{drbd_primitive} on #{ms_name} start" do
- command "sleep 2; crm resource cleanup #{drbd_primitive}"
+ # See big comment above as to why we do that. We know that the founder will
+ # go first here, so we only do this on the founder, but in short: this is
+ # needed because we don't create all the pacemaker resources in the same
+ # transaction.
+ execute "Cleanup and manage #{drbd_primitive} on #{ms_name} start" do
+ command "sleep 2; crm resource cleanup #{drbd_primitive}; crm_resource --resource #{drbd_primitive} --delete-parameter \"is-managed\" --meta || :"
action :nothing
subscribes :run, "pacemaker_ms[#{ms_name}]", :immediately
+ only_if { CrowbarPacemakerHelper.is_cluster_founder?(node) }
end
end
--
1.9.1