-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsnapshot-verify.sh
120 lines (106 loc) · 4.87 KB
/
snapshot-verify.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/bin/bash
#
# Checks that the EBS volumes attached to each EC2 instance have the number of
# automated snapshots required by the retention policy stored on the matching
# Chef node. Requires the aws CLI and chef/knife on the PATH.
export PATH="$PATH:/usr/local/bin:/usr/bin"
# Safety feature: exit script if error is returned, or if variables not set.
# Exit if a pipeline results in an error.
set -euo pipefail
# Set AWS account
aws_profile="AWS_Profile_Name"
# Entries appended while checking; a non-empty list makes check_errors exit 1.
failing_volumes=()
# Get an array of servers.
# The output is captured in a plain variable first so that a failing aws call
# aborts the script (set -e); an array assignment built directly from a command
# substitution would hide the failure and leave an empty server list behind.
# The JMESPath query is quoted because its [] would otherwise be a shell glob.
instance_ids=$(aws ec2 describe-instances --profile "$aws_profile" \
  --output text --query 'Reservations[].Instances[].InstanceId')
# Text output is whitespace separated; fold newlines so one read sees all IDs.
read -r -a servers <<<"${instance_ids//$'\n'/ }"
## Putting the FUN in Functions ##
#######################################
# Set up the logfile and send stdout/stderr through it.
# Globals:
#   logfile            (read) path of the log file
#   logfile_max_lines  (read) number of trailing lines kept when trimming
# Outputs:
#   After a successful call, everything written to stdout/stderr is appended
#   to $logfile by tee, which also copies it to the original stdout.
# Exits:
#   1 (message on stderr) when the logfile cannot be created or written.
# NOTE(review): nothing in the visible script assigns logfile or
# logfile_max_lines, or calls this function - confirm before relying on it.
#######################################
log_setup() {
  local tmplog
  # Create the logfile when it is missing. A failed touch is not fatal here:
  # the writability test below reports it with a proper message.
  [[ -e "$logfile" ]] || touch -- "$logfile" || true
  if [[ ! -w "$logfile" ]]; then
    echo "ERROR: Cannot write to $logfile. Check permissions or sudo access." >&2
    exit 1
  fi
  # Trim the log to its last $logfile_max_lines lines. An empty log is left
  # alone so that no stray blank line is written into it.
  if tmplog=$(tail -n "$logfile_max_lines" -- "$logfile" 2>/dev/null) \
    && [[ -n "$tmplog" ]]; then
    printf '%s\n' "$tmplog" > "$logfile"
  fi
  exec > >(tee -a -- "$logfile")
  exec 2>&1
}
#######################################
# Print a timestamped log line.
# Arguments:
#   All arguments are joined into the message text.
# Outputs:
#   "[YYYY-MM-DD+HH:MM:SS]: message" on stdout.
#######################################
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d+%T')
  printf '%s\n' "[${stamp}]: $*"
}
#######################################
# Verify that each volume in a list has enough automated snapshots.
#
# A volume with fewer snapshots than expected is only recorded as failing when
# it was created before the retention threshold ($3 days ago); a newer volume
# is reported but tolerated.
#
# Globals:
#   aws_profile      (read)    AWS CLI profile
#   region           (read)    region used for the AWS calls (assigned by
#                              volume_attachments)
#   node_days        (read)    minimum number of snapshots expected
#   failing_volumes  (written) IDs of failing volumes are appended
# Arguments:
#   $1 - whitespace-separated list of volume IDs
#   $2 - instance ID (not used by this function)
#   $3 - retention period in days
# Outputs:
#   Progress report on stdout.
# NOTE(review): a volume without any snapshot is reported but never added to
# failing_volumes - confirm that this is intended.
#######################################
verify_snapshots() {
  local volume_id volume_created volume_created_epoch
  local threshold_date threshold_epoch
  local -a snapshot_list
  # $1 is deliberately unquoted: it carries a whitespace-separated ID list.
  for volume_id in $1; do
    # shellcheck disable=SC2207 # snapshot IDs hold no whitespace or glob chars
    snapshot_list=($(aws ec2 describe-snapshots --profile "$aws_profile" \
      --region "$region" --output=text \
      --filters "Name=volume-id,Values=$volume_id" \
      Name=tag-key,Values=CreatedBy Name=tag-value,Values=AutomatedBackup \
      --query 'Snapshots[].SnapshotId'))
    # Epoch second at which the retention window starts.
    threshold_epoch=$(date +%s --date "$3 days ago")
    printf "\tRetention Days: %s\n" "$3"
    # Test the element count: expanding "$snapshot_list" for an empty array
    # is an unbound-variable error under set -u.
    if (( ${#snapshot_list[@]} == 0 )); then
      printf "\tNode doesn't have any Autobackup snapshots!\n\n"
      continue
    fi
    if [ "${#snapshot_list[@]}" -ge "$node_days" ]; then
      printf "\tNode has the correct number of snapshots, checking next server (if one exists)\n\n"
    else
      printf "\tNode does not have the correct number of snapshots - checking to see if it's a new node...\n"
      # Same region as the snapshot query above; cut drops the fractional
      # seconds (and anything after them) from the timestamp.
      volume_created=$(aws ec2 describe-volumes --profile "$aws_profile" \
        --region "$region" --output text --volume-ids "$volume_id" \
        --query 'Volumes[].CreateTime' | cut -d. -f1)
      # Both dates are handled in UTC: the creation time is parsed with -u
      # because its zone designator was cut off, and the threshold is printed
      # with -u so the two values shown side by side are comparable.
      threshold_date=$(date -u +"%Y-%m-%dT%H:%M:%S" --date "$3 days ago")
      # Convert the creation time itself; appending "days ago" to an absolute
      # date would move it one day into the past.
      volume_created_epoch=$(date -u +%s --date "$volume_created")
      printf "\t\tVolume Creation Date:\t\t%s\n\t\tThreashold Date:\t\t%s\n" "$volume_created" "$threshold_date"
      if (( volume_created_epoch < threshold_epoch )); then
        printf "\t\tVolume is failing snapshot threashold!\n\n"
        failing_volumes+=("$volume_id")
      else
        printf "\t\tVolume is newer than threashold - no big deal\n\t\tVolume was created on %s\n\n" "$volume_created"
      fi
    fi
  done
}
#######################################
# Look up the volumes attached to an instance and verify their snapshots.
# Globals:
#   aws_profile  (read)    AWS CLI profile
#   region       (written) region of the instance; verify_snapshots reads it
# Arguments:
#   $1 - EC2 instance ID
#   $2 - retention period in days (forwarded to verify_snapshots)
# Outputs:
#   The attached volume IDs on stdout, followed by verify_snapshots' report.
# Returns:
#   1 when the availability zone of the instance cannot be determined.
#######################################
volume_attachments() {
  local availability_zone volume_list
  # Query the zone directly instead of scraping the table output.
  availability_zone=$(aws ec2 describe-instances --profile "$aws_profile" \
    --instance-ids "$1" --output text \
    --query 'Reservations[].Instances[].Placement.AvailabilityZone')
  if [[ -z "$availability_zone" || "$availability_zone" == "None" ]]; then
    printf 'ERROR: could not determine the availability zone of %s\n' "$1" >&2
    return 1
  fi
  # The region is the zone name without its trailing letter (us-east-1a -> us-east-1).
  region=${availability_zone%[a-z]}
  # Grab a sweet sweet list of volumes and print it out
  volume_list=$(aws ec2 describe-volumes --profile "$aws_profile" \
    --region "$region" --filters "Name=attachment.instance-id,Values=$1" \
    --query 'Volumes[].VolumeId' --output text)
  printf "\tInstance has the following volume(s) attached\n\t\t%s\n" "$volume_list"
  # The list is passed as ONE quoted argument: unquoted, a second volume ID
  # would push the instance ID and retention days out of $2/$3 in the callee.
  # ${2-} keeps a missing retention value from aborting under set -u.
  verify_snapshots "$volume_list" "$1" "${2-}"
}
#######################################
# For every instance in the servers array, look up the snapshot retention
# period on its Chef node and hand the instance to volume_attachments.
# Globals:
#   servers          (read)    EC2 instance IDs to check
#   node_days        (written) retention days of the instance being processed;
#                              verify_snapshots reads it
#   failing_volumes  (written) gets an entry for every instance whose node has
#                              no retention days, so the run ends with an error
# Outputs:
#   Progress report on stdout.
#######################################
retention_days() {
  local index node_name
  for index in "${!servers[@]}"; do
    printf "Checking Retention Policy on Instance: %s\n" "${servers[index]}"
    # Lets query the chef server by instance ID and get the chef server's Node Name
    node_name=$(chef exec knife exec -E "nodes.find(:ec2_instance_id=> '${servers[index]}') { |node| puts node.name }")
    if [[ -z "$node_name" ]]; then
      # Skip this instance only; the loop carries on with the next one.
      printf "\tThis server is not in chef! Breaking\n"
      continue
    fi
    # Now that we have the Node Name, list the attribute and keep its value.
    # awk drops the lines of knife's output that have no second field.
    node_days=$(chef exec knife node show "$node_name" -a ebs_snapshots.retention_days \
      | awk '{ if($2!="") { print $2 } }')
    if [[ -z "$node_days" ]]; then
      # Without a retention value the volumes cannot be checked. Record the
      # instance so the script still fails at the end, and keep going instead
      # of aborting on an unset positional parameter further down.
      printf "\tNode does not have retention days set! Erroring out after checking other servers\n"
      failing_volumes+=("${servers[index]} (no retention days set)")
      continue
    fi
    printf "\tInstance is keeping snapshots for %s days.\n" "$node_days"
    # Pass the server ID into the function to get its volume info
    volume_attachments "${servers[index]}" "$node_days"
  done
}
#######################################
# Report the outcome of the run.
# Globals:
#   failing_volumes (read) entries collected while checking
# Outputs:
#   "All Good", or the list of failing entries, on stdout.
# Exits:
#   1 when failing_volumes is not empty; otherwise returns normally.
#######################################
check_errors() {
  # Count the elements: passing the expanded array to a single -z test breaks
  # as soon as it holds more than one entry.
  if (( ${#failing_volumes[@]} == 0 )); then
    printf "\nAll Good\n"
  else
    printf "\n\nFailing Volumes:\n"
    printf "%s\n" "${failing_volumes[@]}"
    exit 1
  fi
}
# Entry point: check every instance listed in the servers array, then print
# the result. check_errors exits with status 1 when failing_volumes is not empty.
retention_days
check_errors