Skip to content

Commit

Permalink
POC
Browse files Browse the repository at this point in the history
  • Loading branch information
cvvz committed Aug 2, 2023
1 parent f51f4fc commit d6d6de2
Show file tree
Hide file tree
Showing 8 changed files with 393 additions and 4 deletions.
7 changes: 7 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
FROM registry.k8s.io/build-image/debian-base:bullseye-v1.4.3

RUN apt update && apt upgrade -y && apt-mark unhold libcap2 && clean-install ca-certificates uuid-dev util-linux mount udev wget e2fsprogs nfs-common netbase procps conntrack iptables bind9-host iproute2 bash

COPY aznfswatchdog /usr/sbin/aznfswatchdog

CMD [ "aznfswatchdog" ]
358 changes: 358 additions & 0 deletions aznfswatchdog
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
#!/bin/bash

# --------------------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------

#
# How often does the watchdog look for unmounts and/or IP address changes for
# Blob endpoints.
#
MONITOR_INTERVAL_SECS=5

# How often do we check for change in FQDN->IP?
IP_CHANGE_DETECTION_FREQUENCY=60

#
# Remove unmounted entries only if MOUNTMAP has not been changed till MONITOR_INTERVAL_SECS seconds.
# Don't set it below 3 minutes.
#
MOUNTMAP_INACTIVITY_SECS=300

#
# Don't perform mountmap and iptables rule cleanup for unmounted filesystems.
# This can be set if you want lazy umount to work.
#
AZNFS_SKIP_UNMOUNT_CLEANUP="${AZNFS_SKIP_UNMOUNT_CLEANUP:-0}"

# TIMEWAIT timeout to be used for conntrack entries.
AZNFS_TIMEWAIT_TIMEOUT="${AZNFS_TIMEWAIT_TIMEOUT:-65}"

next_ip_change_detection_epoch=0

# Load common aznfs helpers.
. /opt/microsoft/aznfs/common.sh


# Create /opt/microsoft/aznfs/randbytes if not already created.
if [ ! -s /opt/microsoft/aznfs/randbytes ]; then
dd if=/dev/urandom of=/opt/microsoft/aznfs/randbytes bs=256 count=1
fi
if [ ! -s /opt/microsoft/aznfs/randbytes ]; then
uuidgen > /opt/microsoft/aznfs/randbytes
fi
if [ ! -s /opt/microsoft/aznfs/randbytes ]; then
date | md5sum | awk '{print $1}' > /opt/microsoft/aznfs/randbytes
fi
if [ ! -s /opt/microsoft/aznfs/randbytes ]; then
date > /opt/microsoft/aznfs/randbytes
fi
chattr +i /opt/microsoft/aznfs/randbytes

#
# Hash for storing how many times we have seen a conntrack entry in SYN_SENT state.
# Used for finding if some entry is stuck in SYN_SENT state due to a bug in older
# kernels. If we find an entry stuck for more than a certain time in SYN_SENT state
# we delete the entry so that kernel looks up fresh NAT rules and creates a new entry.
#
declare -A cthash_synsent
declare -A cthash_synsent_now

reconcile_conntrack_one()
{
local l_ip=$1
local l_sport=$2
local l_dport=$3
local l_nfsip=$4

key="${l_ip}:${l_sport}:${l_dport}:${l_nfsip}"
let cthash_synsent[$key]++
let cthash_synsent_now[$key]++

#
# We are called every 5 secs, so this deletes an entry stuck in
# SYN_SENT for 25/30 secs.
#
if [ ${cthash_synsent[$key]} -ge 5 ]; then
cmd="conntrack -D -p tcp -d $l_ip -r $l_nfsip --sport $l_sport --dport $l_dport"
wecho "Deleting conntrack entry stuck in SYN_SENT state [$cmd]"

eval $cmd
if [ $? -ne 0 ]; then
eecho "Failed to delete conntrack entry [$cmd]!"
else
unset cthash_synsent[$key]
fi
fi
}

reconcile_conntrack()
{
local l_ip=$1
local l_nfsip=$2

# cthash_synsent_now holds only entries found in this call, so clear it before starting.
unset cthash_synsent_now
declare -A cthash_synsent_now

#
# For mounts with nconnect, there could be more than one conntrack entries to the same
# proxy IP, but with different local ports. We must track them separately.
#
IFS=$'\n' output111=$(conntrack -L -p tcp -d $l_ip -r $l_nfsip --dport 111 --state SYN_SENT 2>/dev/null)
IFS=$'\n' output2048=$(conntrack -L -p tcp -d $l_ip -r $l_nfsip --dport 2048 --state SYN_SENT 2>/dev/null)
output="$output111"$'\n'"$output2048"

if [ -n "$output" ]; then
for line in $output; do
# XXX Remove this log after running for few days.
vecho "$line"

matchstr=".* SYN_SENT src=.* dst=$l_ip sport=([0-9]+) dport=([0-9]+).*"
if [[ "$line" =~ $matchstr ]]; then
l_sport=${BASH_REMATCH[1]}
l_dport=${BASH_REMATCH[2]}
reconcile_conntrack_one $l_ip $l_sport $l_dport $l_nfsip
fi
done
fi

#
# Any port that is not stuck now, means it's recovered since the last time and can be removed
# from the global cthash_synsent map.
#
for key in "${!cthash_synsent[@]}"; do
if [ ! -v cthash_synsent_now[$key] ]; then
unset cthash_synsent[$key]
fi
done
}

vecho "Starting aznfswatchdog..."

# SIGTERM handler.
sigterm_handler()
{
wecho "SIGTERM received, exiting..."
exit 0
}
trap sigterm_handler SIGTERM

# Dump NAT table once on startup in case we have reported conflicts.
vecho "NAT table:\n$(iptables-save -t nat)"
conntrack -L > /dev/null

# conntrack timewait timeout higher than the TCP timewait timeout value isn't very valuable.
conntrack_timeo_timew=$(cat /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait 2>/dev/null)
if [ $? -eq 0 -a $conntrack_timeo_timew -gt $AZNFS_TIMEWAIT_TIMEOUT ]; then
vecho "Changing /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait [$conntrack_timeo_timew -> $AZNFS_TIMEWAIT_TIMEOUT]"
echo $AZNFS_TIMEWAIT_TIMEOUT > /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_time_wait
fi

if ! chattr -f +i $MOUNTMAP; then
wecho "chattr does not work for ${MOUNTMAP}!"
fi



#
# Watchdog for monitoring unmounts and more importantly change in blob endpoint
# addresses possibly as a result of migration.
#
while :; do
sleep $MONITOR_INTERVAL_SECS

#
# TODO: Add a function reconcile_mountmap() and call it from here. This
# should reconstruct the MOUNTMAP file from findmnt and output of
# iptables. This will be added in subsequent revisions.
#

epoch_now=$(date +%s)

#
# Go over all lines in $MOUNTMAP and check them for two things:
# 1. Is that entry still in use by at least one aznfs mount, if not remove the entry.
# 2. Has the Blob endpoint address changed from what is stored?
# If yes, update DNAT rule to point to the new address and update entry accordingly.
#
# Sample line in $MOUNTMAP.
# account.blob.preprod.core.windows.net 10.100.100.100 52.230.170.200
#
# where the format is
# blobendpoint_fqdn proxy_ip blobendpoint_ip
#
# We store the mtime of MOUNTMAP while inside the lock so that if any mount helper process
# updates it after this we will skip modification for sake of safety. We will come to it
# in the next iteration when it's safer.
#
exec {fd}<$MOUNTMAP
flock -e $fd
mtime_mountmap=$(stat -c%Y $MOUNTMAP)
IFS=$'\n' lines=$(cat $MOUNTMAP)
flock -u $fd
exec {fd}<&-

do_ip_change_detection=false
if [ $epoch_now -ge $next_ip_change_detection_epoch ]; then
do_ip_change_detection=true
next_ip_change_detection_epoch=$(expr $(date +%s) + $IP_CHANGE_DETECTION_FREQUENCY)
fi

#
# Do unmount GC only if MOUNTMAP file is not modified in the last
# MOUNTMAP_INACTIVITY_SECS seconds. We don't want to incorrectly delete an
# entry while some aznfs mount is ongoing.
#
do_unmount_gc=false
if [ "$AZNFS_SKIP_UNMOUNT_CLEANUP" == "0" ]; then
if [ $epoch_now -ge $(expr $mtime_mountmap + $MOUNTMAP_INACTIVITY_SECS) ]; then
do_unmount_gc=true
fi
fi

#
# findmnt must be done after reading MOUNTMAP so that if we come accross a
# MOUNTMAP entry whose proxy_ip is not used by any existing mount, we know
# for sure that it's not in use by any mount and can be removed.
#
findmnt=$(findmnt --raw --noheading -o MAJ:MIN,FSTYPE,SOURCE,TARGET,OPTIONS -t nfs 2>&1)

#
# For no matching mounts also, findmnt exits with a failure return, so check
# for both exit status and non-empty error o/p.
#
if [ $? -ne 0 -a -n "$findmnt" ]; then
eecho "${findmnt}."
eecho "[FATAL] findmnt failed unexpectedly!"
eecho "[FATAL] Aznfswatchdog service is exiting, will not monitor Azure NFS shares."
eecho "[FATAL] Please contact Microsoft support before using any Blob NFS shares."
# This usually indicates some non-transient issue, bail out.
exit 1
fi

for line in $lines; do
if [ -z "$line" ]; then
continue
fi

#
# MOUNTMAP line is of the form:
# account.blob.preprod.core.windows.net <local ip> <public ip> [<PID>]
#
IFS=" " read l_host l_ip l_nfsip <<< "$line"

if [ -z "$l_host" -o -z "$l_ip" -o -z "$l_nfsip" ]; then
wecho "[FATAL] Deleting invalid line in $MOUNTMAP: [$line]!"
l_mtime=$(ensure_mountmap_not_exist "$line")
[ $? -eq 0 ] && mtime_mountmap=$l_mtime
continue
fi

# Since we added it to the MOUNTMAP file, it cannot be invalid.
if ! is_private_ip "$l_ip"; then
wecho "[FATAL] local ip ($l_ip) is invalid!"
l_mtime=$(ensure_mountmap_not_exist "$line")
[ $? -eq 0 ] && mtime_mountmap=$l_mtime
continue
fi

# Since we added it to the MOUNTMAP file, it cannot be invalid.
if ! is_valid_ipv4_address "$l_nfsip"; then
wecho "[FATAL] Blob endpoint ip ($l_nfsip) is invalid!"
l_mtime=$(ensure_mountmap_not_exist "$line")
[ $? -eq 0 ] && mtime_mountmap=$l_mtime
continue
fi

#
# Delete entry from MOUNTMAP if there are no mounted shares on that host.
# As long as we have at least one mount using the MOUNTMAP entry, we leave
# it around.
#
if ! echo "$findmnt" | grep " nfs ${l_ip}:" >/dev/null; then
if $do_unmount_gc; then
pecho "No mounted shares for host $l_host, deleting from ${MOUNTMAP} [$line]."

# Delete IFF mountmap is not changed since we read it above.
l_mtime=$(ensure_mountmap_not_exist "$line" "$mtime_mountmap")

#
# Update ifmatch time in case of successful updation of MOUNTMAP,
# so that we can distinguish between MOUNTMAP mtime changing because
# of our action or some mount helper changing it. In the former case
# it's safe to update the MOUNTMAP, so update mtime_mountmap to the
# mtime after this update.
#
[ $? -eq 0 ] && mtime_mountmap=$l_mtime
continue
fi
else
#
# Verify that iptable entry should be present for corresponding
# MOUNTMAP entry if the share is not unmounted.
#
# Note: This is extra protection in case user flushes the iptable
# entries or removes it by mistake. This should not be
# required normally.
#
# We also reconcile conntrack entries stuck in some bad states which
# may hamper communication, f.e., in older kernels there's a bug due to
# which conntrack entry may get stuck in SYN_SENT state if client
# reuse the source port and keep retransmitting SYNs before the entry
# can timeout.
#
reconcile_conntrack "$l_ip" "$l_nfsip"
verify_iptable_entry "$l_ip" "$l_nfsip"

fi

#
# We do IP change detection less frequently than unmount detection
# since it will cause DNS calls on network.
#
if ! $do_ip_change_detection; then
continue
fi

#
# Check if blob endpoint IP address changed.
# This is the migration check.
#
new_ip=$(resolve_ipv4 "$l_host")

# If we fail to resolve the host name, try next time.
if [ $? -ne 0 ]; then
#
# If account is deleted then we need to delete the MOUNTMAP entry along
# with the proxy iptable entry created for that account.
# Note that we don't delete if the MOUNTMAP was changed recently since
# the account may have been re-created after the dns lookup failure.
#
if [ "$new_ip" == "NXDOMAIN" ]; then
pecho "Account corresponding to $l_host seems to have been deleted, deleting from ${MOUNTMAP} [$line]!"

l_mtime=$(ensure_mountmap_not_exist "$line" "$mtime_mountmap")
[ $? -eq 0 ] && mtime_mountmap=$l_mtime
else
eecho "resolve_ipv4($l_host) failed: $new_ip"
fi
continue
fi

#
# If the IP changed for the Blob endpoint, we need to update the DNAT rule.
# This will take care of migration/failover causing the Blob endpoint IP to change.
#
if [ "$new_ip" != "$l_nfsip" ]; then
pecho "IP for $l_host changed [$l_nfsip -> $new_ip]."

# This will update DNAT rule as well.
if ! update_mountmap_entry "$line" "$l_host $l_ip $new_ip"; then
eecho "Will reattempt the operation in next iteration."
fi
fi
done

done
16 changes: 16 additions & 0 deletions charts/latest/blob-csi-driver/templates/csi-blob-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,20 @@ spec:
mountPath: /etc/pki/ca-trust/extracted
readOnly: true
{{- end }}
- mountPath: /opt/microsoft/aznfs/
name: aznfs
resources: {{- toYaml .Values.node.resources.blob | nindent 12 }}
- name: aznfswatchdog
image: cvvz/aznfswatchdog:latest
imagePullPolicy: Always
securityContext:
privileged: true
volumeMounts:
- mountPath: /opt/microsoft/aznfs/
name: aznfs
- mountPath: {{ .Values.linux.kubelet }}/
mountPropagation: Bidirectional
name: mountpoint-dir
volumes:
{{- if .Values.node.enableBlobfuseProxy }}
- name: host-usr
Expand Down Expand Up @@ -280,6 +293,9 @@ spec:
hostPath:
path: /etc/pki/ca-trust/extracted
{{- end }}
- hostPath:
path: /opt/microsoft/aznfs/
name: aznfs
{{- if .Values.securityContext }}
securityContext: {{- toYaml .Values.securityContext | nindent 8 }}
{{- end }}
Binary file added mount.aznfs
Binary file not shown.
Loading

0 comments on commit d6d6de2

Please sign in to comment.