forked from kubernetes-sigs/blob-csi-driver
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
393 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
# Runtime image for the aznfswatchdog script.
FROM registry.k8s.io/build-image/debian-base:bullseye-v1.4.3

# apt-get is used instead of apt because apt's command-line interface is not
# guaranteed stable for scripted use (it prints a warning in non-interactive
# builds). --with-new-pkgs keeps the behaviour of "apt upgrade", which may pull
# in new packages to satisfy upgraded dependencies.
# The libcap2 hold is released before clean-install runs (order kept as is).
RUN apt-get update && apt-get upgrade -y --with-new-pkgs && apt-mark unhold libcap2 && clean-install ca-certificates uuid-dev util-linux mount udev wget e2fsprogs nfs-common netbase procps conntrack iptables bind9-host iproute2 bash

# The watchdog script is installed on PATH so CMD can start it by name.
COPY aznfswatchdog /usr/sbin/aznfswatchdog

CMD [ "aznfswatchdog" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,358 @@ | ||
#!/bin/bash | ||
|
||
# -------------------------------------------------------------------------------------------- | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# Licensed under the MIT License. See License.txt in the project root for license information. | ||
# -------------------------------------------------------------------------------------------- | ||
|
||
#
# Watchdog tunables. All durations are in seconds.
#

# Period of the main loop: how often the watchdog looks for unmounts and/or
# IP address changes for Blob endpoints.
MONITOR_INTERVAL_SECS=5

# Minimum gap between two FQDN->IP change checks.
IP_CHANGE_DETECTION_FREQUENCY=60

# Unmounted entries are removed only if MOUNTMAP has not been modified for
# at least MOUNTMAP_INACTIVITY_SECS seconds.
# Don't set it below 3 minutes.
MOUNTMAP_INACTIVITY_SECS=300

# When set to anything other than 0, mountmap and iptables rule cleanup for
# unmounted filesystems is skipped. Set this if you want lazy umount to work.
# Honours a value already present in the environment; defaults to 0.
: "${AZNFS_SKIP_UNMOUNT_CLEANUP:=0}"

# TIMEWAIT timeout to be used for conntrack entries.
# Honours a value already present in the environment; defaults to 65.
: "${AZNFS_TIMEWAIT_TIMEOUT:=65}"

# Epoch time at/after which the next IP change detection pass is due.
# Starting at 0 makes the first loop iteration perform it.
next_ip_change_detection_epoch=0
|
||
# Load common aznfs helpers. | ||
. /opt/microsoft/aznfs/common.sh | ||
|
||
|
||
#
# Create /opt/microsoft/aznfs/randbytes if not already created (or if empty).
#
# Generators are tried in order of preference; each one runs only if the file
# is still missing or empty after the previous attempt:
#   1. 256 bytes from /dev/urandom
#   2. a UUID from uuidgen
#   3. md5sum of the current date
#   4. the current date itself (last resort)
#
# NOTE(review): the consumer of this file is not visible in this script;
# presumably helpers from common.sh read it - confirm.
#
aznfs_randbytes_file=/opt/microsoft/aznfs/randbytes

[ -s "$aznfs_randbytes_file" ] || dd if=/dev/urandom of="$aznfs_randbytes_file" bs=256 count=1
[ -s "$aznfs_randbytes_file" ] || uuidgen > "$aznfs_randbytes_file"
[ -s "$aznfs_randbytes_file" ] || date | md5sum | awk '{print $1}' > "$aznfs_randbytes_file"
[ -s "$aznfs_randbytes_file" ] || date > "$aznfs_randbytes_file"

#
# Mark the file immutable. Not every filesystem supports the immutable
# attribute, so treat failure as a warning, the same way the MOUNTMAP chattr
# is handled further down in this script.
#
if ! chattr -f +i "$aznfs_randbytes_file"; then
    wecho "chattr does not work for ${aznfs_randbytes_file}!"
fi
|
||
#
# Hash for storing how many times we have seen a conntrack entry in SYN_SENT state.
# Used for finding if some entry is stuck in SYN_SENT state due to a bug in older
# kernels. If we find an entry stuck for more than a certain time in SYN_SENT state
# we delete the entry so that kernel looks up fresh NAT rules and creates a new entry.
#
# Keys have the form "<ip>:<sport>:<dport>:<nfsip>".
# cthash_synsent      persists across watchdog iterations.
# cthash_synsent_now  holds only the entries seen by the current
#                     reconcile_conntrack() call.
#
declare -A cthash_synsent
declare -A cthash_synsent_now

#
# Account for one conntrack entry observed in SYN_SENT state and delete it once
# it has been observed 5 times.
#
# Arguments:
#   $1 - destination IP of the entry (passed to "conntrack -d")
#   $2 - source port of the entry
#   $3 - destination port of the entry
#   $4 - reply source IP of the entry (passed to "conntrack -r")
# Globals:
#   cthash_synsent     - counter for this entry is incremented; the entry is
#                        dropped after a successful conntrack delete.
#   cthash_synsent_now - counter for this entry is incremented.
#
reconcile_conntrack_one()
{
    local l_ip=$1
    local l_sport=$2
    local l_dport=$3
    local l_nfsip=$4

    # Kept local so that callers' variables named "key" are not clobbered.
    local key="${l_ip}:${l_sport}:${l_dport}:${l_nfsip}"

    # The command is kept as an argument vector so it can be run without eval.
    # cmd_str is the same command as text, used only for log messages (it is
    # built explicitly because "${cmd[*]}" would join using the current IFS).
    local -a cmd=(conntrack -D -p tcp -d "$l_ip" -r "$l_nfsip" --sport "$l_sport" --dport "$l_dport")
    local cmd_str="conntrack -D -p tcp -d $l_ip -r $l_nfsip --sport $l_sport --dport $l_dport"

    cthash_synsent[$key]=$(( ${cthash_synsent[$key]:-0} + 1 ))
    cthash_synsent_now[$key]=$(( ${cthash_synsent_now[$key]:-0} + 1 ))

    #
    # We are called every 5 secs, so this deletes an entry stuck in
    # SYN_SENT for 25/30 secs.
    #
    if [[ "${cthash_synsent[$key]}" -lt 5 ]]; then
        return 0
    fi

    wecho "Deleting conntrack entry stuck in SYN_SENT state [$cmd_str]"

    if "${cmd[@]}"; then
        # Deleted; start counting afresh if the entry shows up again.
        unset "cthash_synsent[$key]"
    else
        # Counter is left in place so the delete is retried on the next call.
        eecho "Failed to delete conntrack entry [$cmd_str]!"
    fi
}
|
||
#
# Look for conntrack entries towards one endpoint that are in SYN_SENT state
# and hand each of them to reconcile_conntrack_one(), then forget counters of
# entries for this endpoint that are no longer in SYN_SENT state.
#
# Arguments:
#   $1 - destination IP to look for (passed to "conntrack -d")
#   $2 - reply source IP to look for (passed to "conntrack -r")
# Globals:
#   cthash_synsent     - counters for this endpoint are pruned here.
#   cthash_synsent_now - reset at entry; filled by reconcile_conntrack_one().
#
reconcile_conntrack()
{
    local l_ip=$1
    local l_nfsip=$2

    #
    # Everything below is local on purpose. The caller's loop also uses a
    # variable called "line" and passes it to MOUNTMAP update helpers after
    # this function returns, so it must not be overwritten here.
    #
    local line key l_sport l_dport matchstr output output111 output2048
    local -a ct_lines=()

    # cthash_synsent_now holds only entries found in this call, so clear it before starting.
    unset cthash_synsent_now
    declare -gA cthash_synsent_now

    #
    # For mounts with nconnect, there could be more than one conntrack entries to the same
    # proxy IP, but with different local ports. We must track them separately.
    #
    output111=$(conntrack -L -p tcp -d "$l_ip" -r "$l_nfsip" --dport 111 --state SYN_SENT 2>/dev/null)
    output2048=$(conntrack -L -p tcp -d "$l_ip" -r "$l_nfsip" --dport 2048 --state SYN_SENT 2>/dev/null)
    output="$output111"$'\n'"$output2048"

    # Split into lines without touching the global IFS.
    mapfile -t ct_lines <<< "$output"

    matchstr=".* SYN_SENT src=.* dst=$l_ip sport=([0-9]+) dport=([0-9]+).*"

    for line in "${ct_lines[@]}"; do
        [[ -n "$line" ]] || continue

        # XXX Remove this log after running for few days.
        vecho "$line"

        if [[ "$line" =~ $matchstr ]]; then
            l_sport=${BASH_REMATCH[1]}
            l_dport=${BASH_REMATCH[2]}
            reconcile_conntrack_one "$l_ip" "$l_sport" "$l_dport" "$l_nfsip"
        fi
    done

    #
    # Any port that is not stuck now, means it's recovered since the last time and can be removed
    # from the global cthash_synsent map.
    #
    # Only keys belonging to this (l_ip, l_nfsip) pair are considered: this
    # function is called once per endpoint, and cthash_synsent_now only knows
    # about the endpoint being processed. Pruning other endpoints' keys here
    # would reset their counters on every call and they would never reach the
    # deletion threshold.
    #
    for key in "${!cthash_synsent[@]}"; do
        [[ "$key" == "${l_ip}:"*":${l_nfsip}" ]] || continue

        if [[ -z "${cthash_synsent_now[$key]+x}" ]]; then
            unset "cthash_synsent[$key]"
        fi
    done
}
|
||
vecho "Starting aznfswatchdog..."

#
# Graceful shutdown: when SIGTERM is delivered, log a warning and leave with
# exit status 0.
#
sigterm_handler()
{
    wecho "SIGTERM received, exiting..."; exit 0
}

trap 'sigterm_handler' TERM
|
||
# Dump NAT table once on startup in case we have reported conflicts.
vecho "NAT table:\n$(iptables-save -t nat)"

# NOTE(review): output is discarded; presumably this is run for its side
# effect (e.g. making sure connection tracking is available) - confirm.
conntrack -L > /dev/null

#
# conntrack timewait timeout higher than the TCP timewait timeout value isn't very valuable.
# Lower it to AZNFS_TIMEWAIT_TIMEOUT, but only if the current value could be
# read, is a number, and is actually higher. The checks are chained with &&
# so that an unreadable/empty value never reaches the numeric comparison.
#
if conntrack_timeo_timew=$(cat /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait 2>/dev/null) &&
   [[ "$conntrack_timeo_timew" =~ ^[0-9]+$ ]] &&
   [ "$conntrack_timeo_timew" -gt "$AZNFS_TIMEWAIT_TIMEOUT" ]; then
    vecho "Changing /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait [$conntrack_timeo_timew -> $AZNFS_TIMEWAIT_TIMEOUT]"
    echo "$AZNFS_TIMEWAIT_TIMEOUT" > /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait
fi

# Mark MOUNTMAP immutable; a failure here is only worth a warning.
if ! chattr -f +i "$MOUNTMAP"; then
    wecho "chattr does not work for ${MOUNTMAP}!"
fi
|
||
|
||
|
||
#
# Watchdog for monitoring unmounts and more importantly change in blob endpoint
# addresses possibly as a result of migration.
#
# Each iteration:
#   1. snapshots MOUNTMAP (content and mtime) under an exclusive flock,
#   2. takes a findmnt snapshot of nfs mounts,
#   3. for every MOUNTMAP line: drops invalid lines, garbage collects entries
#      with no mount (when allowed), reconciles conntrack/iptables for entries
#      in use and, every IP_CHANGE_DETECTION_FREQUENCY seconds, re-resolves the
#      endpoint and updates the entry if its IP changed.
#
while :; do
    sleep "$MONITOR_INTERVAL_SECS"

    #
    # TODO: Add a function reconcile_mountmap() and call it from here. This
    #       should reconstruct the MOUNTMAP file from findmnt and output of
    #       iptables. This will be added in subsequent revisions.
    #

    epoch_now=$(date +%s)

    #
    # Go over all lines in $MOUNTMAP and check them for two things:
    # 1. Is that entry still in use by at least one aznfs mount, if not remove the entry.
    # 2. Has the Blob endpoint address changed from what is stored?
    #    If yes, update DNAT rule to point to the new address and update entry accordingly.
    #
    # Sample line in $MOUNTMAP.
    # account.blob.preprod.core.windows.net 10.100.100.100 52.230.170.200
    #
    # where the format is
    # blobendpoint_fqdn proxy_ip blobendpoint_ip
    #
    # We store the mtime of MOUNTMAP while inside the lock so that if any mount helper process
    # updates it after this we will skip modification for sake of safety. We will come to it
    # in the next iteration when it's safer.
    #
    # The lines are read into an array (one element per line) rather than
    # being word-split from a string, so that MOUNTMAP content can never be
    # subject to pathname expansion. The array is reset first so that a failed
    # read does not leave the previous iteration's snapshot behind.
    #
    lines=()
    exec {fd}<"$MOUNTMAP"
    flock -e "$fd"
    mtime_mountmap=$(stat -c%Y "$MOUNTMAP")
    mapfile -t lines < "$MOUNTMAP"
    flock -u "$fd"
    exec {fd}<&-

    #
    # NOTE(review): earlier revisions of this loop set IFS to newline at this
    # point and the setting stayed in effect for every helper called below.
    # It is retained because the helpers sourced from common.sh are not
    # visible here and may have only ever run with it - confirm and drop.
    #
    IFS=$'\n'

    do_ip_change_detection=false
    if (( epoch_now >= next_ip_change_detection_epoch )); then
        do_ip_change_detection=true
        next_ip_change_detection_epoch=$(( $(date +%s) + IP_CHANGE_DETECTION_FREQUENCY ))
    fi

    #
    # Do unmount GC only if MOUNTMAP file is not modified in the last
    # MOUNTMAP_INACTIVITY_SECS seconds. We don't want to incorrectly delete an
    # entry while some aznfs mount is ongoing.
    # If the mtime could not be read, GC stays disabled for this iteration.
    #
    do_unmount_gc=false
    if [[ "$AZNFS_SKIP_UNMOUNT_CLEANUP" == "0" && -n "$mtime_mountmap" ]]; then
        if (( epoch_now >= mtime_mountmap + MOUNTMAP_INACTIVITY_SECS )); then
            do_unmount_gc=true
        fi
    fi

    #
    # findmnt must be done after reading MOUNTMAP so that if we come across a
    # MOUNTMAP entry whose proxy_ip is not used by any existing mount, we know
    # for sure that it's not in use by any mount and can be removed.
    #
    # For no matching mounts also, findmnt exits with a failure return, so check
    # for both exit status and non-empty error o/p.
    #
    if ! findmnt=$(findmnt --raw --noheading -o MAJ:MIN,FSTYPE,SOURCE,TARGET,OPTIONS -t nfs 2>&1); then
        if [[ -n "$findmnt" ]]; then
            eecho "${findmnt}."
            eecho "[FATAL] findmnt failed unexpectedly!"
            eecho "[FATAL] Aznfswatchdog service is exiting, will not monitor Azure NFS shares."
            eecho "[FATAL] Please contact Microsoft support before using any Blob NFS shares."
            # This usually indicates some non-transient issue, bail out.
            exit 1
        fi
    fi

    #
    # The loop variable is deliberately not called "line": functions called
    # from the loop body use that name for their own iteration, and the
    # MOUNTMAP line must stay intact because it is passed to the MOUNTMAP
    # update helpers after those calls.
    #
    for mountmap_line in "${lines[@]}"; do
        if [[ -z "$mountmap_line" ]]; then
            continue
        fi

        #
        # MOUNTMAP line is of the form:
        # account.blob.preprod.core.windows.net <local ip> <public ip> [<PID>]
        #
        # NOTE(review): only three fields are read, so an optional trailing
        # field would end up as part of l_nfsip and fail the IPv4 validation
        # below - confirm whether the <PID> field is still written.
        #
        IFS=" " read -r l_host l_ip l_nfsip <<< "$mountmap_line"

        if [[ -z "$l_host" || -z "$l_ip" || -z "$l_nfsip" ]]; then
            wecho "[FATAL] Deleting invalid line in $MOUNTMAP: [$mountmap_line]!"
            if l_mtime=$(ensure_mountmap_not_exist "$mountmap_line"); then
                mtime_mountmap=$l_mtime
            fi
            continue
        fi

        # Since we added it to the MOUNTMAP file, it cannot be invalid.
        if ! is_private_ip "$l_ip"; then
            wecho "[FATAL] local ip ($l_ip) is invalid!"
            if l_mtime=$(ensure_mountmap_not_exist "$mountmap_line"); then
                mtime_mountmap=$l_mtime
            fi
            continue
        fi

        # Since we added it to the MOUNTMAP file, it cannot be invalid.
        if ! is_valid_ipv4_address "$l_nfsip"; then
            wecho "[FATAL] Blob endpoint ip ($l_nfsip) is invalid!"
            if l_mtime=$(ensure_mountmap_not_exist "$mountmap_line"); then
                mtime_mountmap=$l_mtime
            fi
            continue
        fi

        #
        # Delete entry from MOUNTMAP if there are no mounted shares on that host.
        # As long as we have at least one mount using the MOUNTMAP entry, we leave
        # it around.
        #
        # The match is a fixed-string one (-F) so that the dots in the IP are
        # compared literally and not as regex wildcards.
        #
        if ! grep -qF -- " nfs ${l_ip}:" <<< "$findmnt"; then
            if [[ "$do_unmount_gc" == true ]]; then
                pecho "No mounted shares for host $l_host, deleting from ${MOUNTMAP} [$mountmap_line]."

                #
                # Delete IFF mountmap is not changed since we read it above.
                #
                # Update ifmatch time in case of successful updation of MOUNTMAP,
                # so that we can distinguish between MOUNTMAP mtime changing because
                # of our action or some mount helper changing it. In the former case
                # it's safe to update the MOUNTMAP, so update mtime_mountmap to the
                # mtime after this update.
                #
                if l_mtime=$(ensure_mountmap_not_exist "$mountmap_line" "$mtime_mountmap"); then
                    mtime_mountmap=$l_mtime
                fi
                continue
            fi
        else
            #
            # Verify that iptable entry should be present for corresponding
            # MOUNTMAP entry if the share is not unmounted.
            #
            # Note: This is extra protection in case user flushes the iptable
            #       entries or removes it by mistake. This should not be
            #       required normally.
            #
            # We also reconcile conntrack entries stuck in some bad states which
            # may hamper communication, f.e., in older kernels there's a bug due to
            # which conntrack entry may get stuck in SYN_SENT state if client
            # reuse the source port and keep retransmitting SYNs before the entry
            # can timeout.
            #
            reconcile_conntrack "$l_ip" "$l_nfsip"
            verify_iptable_entry "$l_ip" "$l_nfsip"
        fi

        #
        # We do IP change detection less frequently than unmount detection
        # since it will cause DNS calls on network.
        #
        if [[ "$do_ip_change_detection" != true ]]; then
            continue
        fi

        #
        # Check if blob endpoint IP address changed.
        # This is the migration check.
        #
        # If we fail to resolve the host name, try next time.
        #
        if ! new_ip=$(resolve_ipv4 "$l_host"); then
            #
            # If account is deleted then we need to delete the MOUNTMAP entry along
            # with the proxy iptable entry created for that account.
            # Note that we don't delete if the MOUNTMAP was changed recently since
            # the account may have been re-created after the dns lookup failure.
            #
            if [[ "$new_ip" == "NXDOMAIN" ]]; then
                pecho "Account corresponding to $l_host seems to have been deleted, deleting from ${MOUNTMAP} [$mountmap_line]!"

                if l_mtime=$(ensure_mountmap_not_exist "$mountmap_line" "$mtime_mountmap"); then
                    mtime_mountmap=$l_mtime
                fi
            else
                eecho "resolve_ipv4($l_host) failed: $new_ip"
            fi
            continue
        fi

        #
        # If the IP changed for the Blob endpoint, we need to update the DNAT rule.
        # This will take care of migration/failover causing the Blob endpoint IP to change.
        #
        if [[ "$new_ip" != "$l_nfsip" ]]; then
            pecho "IP for $l_host changed [$l_nfsip -> $new_ip]."

            # This will update DNAT rule as well.
            if ! update_mountmap_entry "$mountmap_line" "$l_host $l_ip $new_ip"; then
                eecho "Will reattempt the operation in next iteration."
            fi
        fi
    done
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Oops, something went wrong.