Skip to content

Commit

Permalink
Fix: Raise an exception as a rapid return of ssh-related operations t…
Browse files Browse the repository at this point in the history
…o prevent hang (bsc#1228899) (#1520)

port #1508 , also add functional test for ssh blocking case
  • Loading branch information
liangxin1300 authored Sep 2, 2024
2 parents 951595d + 37ec6ee commit 6239959
Show file tree
Hide file tree
Showing 14 changed files with 162 additions and 14 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/crmsh-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,21 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
flags: integration

functional_test_blocking_ssh:
runs-on: ubuntu-24.04
timeout-minutes: 40
steps:
- uses: actions/checkout@v4
- name: functional test for blocking ssh
run: |
echo '{ "exec-opts": ["native.cgroupdriver=systemd"] }' | sudo tee /etc/docker/daemon.json
sudo systemctl restart docker.service
$DOCKER_SCRIPT `$GET_INDEX_OF cluster_blocking_ssh`
- uses: codecov/codecov-action@v4
with:
token: ${{ secrets.CODECOV_TOKEN }}
flags: integration

original_regression_test:
runs-on: ubuntu-24.04
timeout-minutes: 40
Expand Down
4 changes: 2 additions & 2 deletions codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ coverage:
threshold: 0.35%
codecov:
notify:
after_n_builds: 27
after_n_builds: 28
comment:
after_n_builds: 27
after_n_builds: 28
layout: "condensed_header, flags, files, condensed_footer"
1 change: 1 addition & 0 deletions crmsh/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,7 @@ def get(self, value):
'lock_timeout': opt_string('120'),
'add_advised_op_values': opt_boolean('yes'),
'OCF_1_1_SUPPORT': opt_boolean('yes'),
'no_ssh': opt_boolean('no'),
'obscure_pattern': opt_string('passw*')
},
'path': {
Expand Down
2 changes: 2 additions & 0 deletions crmsh/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -448,4 +448,6 @@

# Commands that are deprecated and hidden from UI
HIDDEN_COMMANDS = {'ms'}

NO_SSH_ERROR_MSG = "ssh-related operations are disabled. crmsh works in local mode."
# vim:ts=4:sw=4:et:
3 changes: 3 additions & 0 deletions crmsh/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -369,6 +369,9 @@ def run():
else:
upgradeutil.upgrade_if_needed()
return main_input_loop(context, user_args)
except utils.NoSSHError as msg:
logger.error('%s', msg)
sys.exit(1)
except KeyboardInterrupt:
if config.core.debug:
raise
Expand Down
14 changes: 11 additions & 3 deletions crmsh/report/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import crmsh.report.sh
import crmsh.user_of_host
from crmsh import utils as crmutils
from crmsh import constants as crmconstants
from crmsh import config, log, tmpfiles, ui_cluster
from crmsh.sh import ShellUtils

Expand Down Expand Up @@ -331,13 +332,20 @@ def process_dest(context: Context) -> None:
context.dest_path = f"{context.dest_dir}/{context.dest}.tar{context.compress_suffix}"


def local_mode(context: Context) -> bool:
return context.single or (len(context.node_list) == 1 and context.node_list[0] == context.me)


def process_node_list(context: Context) -> None:
if not context.node_list:
if not local_mode(context) and config.core.no_ssh:
logger.error('%s', crmconstants.NO_SSH_ERROR_MSG)
context.single = True
if context.single:
context.node_list = [context.me]
elif not context.node_list:
context.node_list = crmutils.list_cluster_nodes()
if not context.node_list:
raise utils.ReportGenericError("Could not figure out a list of nodes; is this a cluster node?")
if context.single:
context.node_list = [context.me]

for node in context.node_list[:]:
if node == context.me:
Expand Down
20 changes: 15 additions & 5 deletions crmsh/ui_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,10 +180,15 @@ def do_start(self, context, *args):
service_check_list.append("corosync-qdevice.service")

service_manager = ServiceManager()
for node in node_list[:]:
if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
logger.info("The cluster stack already started on {}".format(node))
node_list.remove(node)
try:
for node in node_list[:]:
if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
logger.info("The cluster stack already started on {}".format(node))
node_list.remove(node)
except utils.NoSSHError as msg:
logger.error('%s', msg)
logger.info("Please try 'crm cluster start' on each node")
return
if not node_list:
return

Expand Down Expand Up @@ -240,7 +245,12 @@ def do_stop(self, context, *args):
Stops the cluster stack on all nodes or specific node(s)
'''
node_list = parse_option_for_nodes(context, *args)
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
try:
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
except utils.NoSSHError as msg:
logger.error('%s', msg)
logger.info("Please try 'crm cluster stop' on each node")
return
if not node_list:
return
logger.debug(f"stop node list: {node_list}")
Expand Down
4 changes: 4 additions & 0 deletions crmsh/upgradeutil.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import crmsh.parallax
import crmsh.utils
from crmsh import user_of_host
import crmsh.config
import crmsh.constants
from crmsh.prun import prun


Expand Down Expand Up @@ -152,6 +154,8 @@ def ask(msg: str):


def upgrade_if_needed():
if crmsh.config.core.no_ssh:
return
if os.geteuid() != 0:
return
if not crmsh.utils.can_ask(background_wait=False):
Expand Down
3 changes: 3 additions & 0 deletions crmsh/user_of_host.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ def user_of(self, host):

def user_pair_for_ssh(self, host: str) -> typing.Tuple[str, str]:
"""Return (local_user, remote_user) pair for ssh connection"""
if config.core.no_ssh:
from .utils import NoSSHError
raise NoSSHError(constants.NO_SSH_ERROR_MSG)
local_user = None
remote_user = None
try:
Expand Down
16 changes: 16 additions & 0 deletions crmsh/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3155,4 +3155,20 @@ def ansible_facts(module_name) -> dict:
out = out[bracket_pos:]
json_tree = json.loads(out)
return json_tree['ansible_facts']


class NoSSHError(Exception):
pass


def ssh_command():
"""
Wrapper function for ssh command
When ssh between cluster nodes is blocked, core.no_ssh
should be set to 'yes', then this function will raise NoSSHError
"""
if config.core.no_ssh:
raise NoSSHError(constants.NO_SSH_ERROR_MSG)
return "ssh"
# vim:ts=4:sw=4:et:
1 change: 1 addition & 0 deletions data-manifest
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ test/features/bootstrap_options.feature
test/features/bootstrap_sbd_delay.feature
test/features/bootstrap_sbd_normal.feature
test/features/cluster_api.feature
test/features/cluster_blocking_ssh.feature
test/features/configure_bugs.feature
test/features/constraints_bugs.feature
test/features/corosync_ui.feature
Expand Down
1 change: 1 addition & 0 deletions etc/crm.conf.in
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
; report_tool_options =
; lock_timeout = 120
; add_advised_op_values = yes
; no_ssh = no

; set OCF_1_1_SUPPORT to yes is to fully turn on OCF 1.1 feature once the corresponding CIB detected.
; OCF_1_1_SUPPORT = yes
Expand Down
77 changes: 77 additions & 0 deletions test/features/cluster_blocking_ssh.feature
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
Feature: cluster testing with ssh blocked

Need nodes: hanode1 hanode2

Scenario: Cluster testing with ssh blocked (bsc#1228899)
Given Cluster service is "stopped" on "hanode1"
And Cluster service is "stopped" on "hanode2"
# without bootstrap, write corosync.conf and authkey directly
When Run "dd if=/dev/urandom of=/etc/corosync/authkey bs=1 count=256" on "hanode1"
And Write multi lines to file "/etc/corosync/corosync.conf" on "hanode1"
"""
totem {
version: 2
cluster_name: hacluster
transport: knet
token: 5000
join: 60
max_messages: 20
token_retransmits_before_loss_const: 10
crypto_hash: sha1
crypto_cipher: aes256
}
quorum {
provider: corosync_votequorum
two_node: 1
}
logging {
to_logfile: yes
logfile: /var/log/cluster/corosync.log
to_syslog: yes
timestamp: on
}
nodelist {
node {
ring0_addr: @hanode1.ip.default
name: hanode1
nodeid: 1
}
node {
ring0_addr: @hanode2.ip.default
name: hanode2
nodeid: 2
}
}
"""
And Run "scp /etc/corosync/authkey /etc/corosync/corosync.conf hanode2:/etc/corosync/" on "hanode1"
And Run "systemctl start pacemaker" on "hanode1"
And Run "systemctl start pacemaker" on "hanode2"
And Wait for DC
Then Cluster service is "started" on "hanode1"
And Cluster service is "started" on "hanode2"
And Online nodes are "hanode1 hanode2"
# block ssh between nodes
When Run "systemctl start firewalld" on "hanode2"
And Run "firewall-cmd --zone=public --add-rich-rule='rule port port=22 protocol=tcp drop' --permanent && firewall-cmd --reload" on "hanode2"
And Try "ssh -o ConnectTimeout=5 hanode2" on "hanode1"
Then Except "ssh: connect to host hanode2 port 22: Connection timed out" in stderr
When Run "timeout 5s crm report || echo "timeout"" on "hanode1"
Then Expected "timeout" in stdout
When Write multi lines to file "/etc/crm/crm.conf" on "hanode1"
"""
[core]
no_ssh = yes
"""
When Run "crm configure property stonith-enabled=false" on "hanode1"
And Run "crm report -d /tmp/report" on "hanode1"
Then Directory "/tmp/report/hanode1" created
Then Directory "/tmp/report/hanode2" not created
Then Expected "ERROR: ssh-related operations are disabled. crmsh works in local mode." in stderr
Then Run "crm status" OK on "hanode1"
When Run "crm cluster stop --all" on "hanode1"
Then Expected "ERROR: ssh-related operations are disabled. crmsh works in local mode." in stderr
15 changes: 11 additions & 4 deletions test/unittests/test_report_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -455,23 +455,30 @@ def test_process_arguments_value_error(self):
core.process_arguments(mock_ctx_inst)
self.assertEqual("The start time must be before the finish time", str(err.exception))

@mock.patch('crmsh.report.core.local_mode')
@mock.patch('crmsh.utils.list_cluster_nodes')
def test_process_node_list_exception(self, mock_list_nodes):
mock_ctx_inst = mock.Mock(node_list=[])
def test_process_node_list_exception(self, mock_list_nodes, mock_local_mode):
mock_local_mode.return_value = False
config.core.no_ssh = False
mock_ctx_inst = mock.Mock(node_list=[], single=False)
mock_list_nodes.return_value = []
with self.assertRaises(utils.ReportGenericError) as err:
core.process_node_list(mock_ctx_inst)
self.assertEqual("Could not figure out a list of nodes; is this a cluster node?", str(err.exception))

@mock.patch('crmsh.report.core.local_mode')
@mock.patch('crmsh.utils.list_cluster_nodes')
def test_process_node_list_single(self, mock_list_nodes):
def test_process_node_list_single(self, mock_list_nodes, mock_local_mode):
mock_local_mode.return_value = True
mock_ctx_inst = mock.Mock(node_list=["node1", "node2"], single=True, me="node1")
core.process_node_list(mock_ctx_inst)

@mock.patch('crmsh.report.core.local_mode')
@mock.patch('logging.Logger.error')
@mock.patch('crmsh.utils.ping_node')
@mock.patch('crmsh.utils.list_cluster_nodes')
def test_process_node_list(self, mock_list_nodes, mock_ping, mock_error):
def test_process_node_list(self, mock_list_nodes, mock_ping, mock_error, mock_local_mode):
mock_local_mode.return_value = False
mock_ctx_inst = mock.Mock(node_list=["node1", "node2"], single=False, me="node1")
mock_ping.side_effect = ValueError("error")
core.process_node_list(mock_ctx_inst)
Expand Down

0 comments on commit 6239959

Please sign in to comment.