Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(elasticsearch) Analytics indices creation on AWS ES #5502

Merged
merged 17 commits into from
Sep 29, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
181 changes: 110 additions & 71 deletions docker/elasticsearch-setup/create-indices.sh
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -6,98 +6,137 @@ set -e
: ${USE_AWS_ELASTICSEARCH:=false}
: ${ELASTICSEARCH_INSECURE:=false}

# protocol: http or https?
if [[ $ELASTICSEARCH_USE_SSL == true ]]; then
ELASTICSEARCH_PROTOCOL=https
else
ELASTICSEARCH_PROTOCOL=http
fi
echo -e "Going to use protocol: $ELASTICSEARCH_PROTOCOL"
echo -e "going to use protocol: $ELASTICSEARCH_PROTOCOL"

if [[ ! -z $ELASTICSEARCH_USERNAME ]] && [[ -z $ELASTICSEARCH_AUTH_HEADER ]]; then
AUTH_TOKEN=$(echo -ne "$ELASTICSEARCH_USERNAME:$ELASTICSEARCH_PASSWORD" | base64 --wrap 0)
ELASTICSEARCH_AUTH_HEADER="Authorization:Basic $AUTH_TOKEN"
fi
# Elasticsearch URL to be suffixed with a resource address
ELASTICSEARCH_URL="$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT"

# Add default header if needed
# set auth header if none is given
if [[ -z $ELASTICSEARCH_AUTH_HEADER ]]; then
echo -e "Going to use default elastic headers"
ELASTICSEARCH_AUTH_HEADER="Accept: */*"
if [[ ! -z $ELASTICSEARCH_USERNAME ]]; then
# no auth header given, but username is defined -> use it to create the auth header
AUTH_TOKEN=$(echo -ne "$ELASTICSEARCH_USERNAME:$ELASTICSEARCH_PASSWORD" | base64 --wrap 0)
ELASTICSEARCH_AUTH_HEADER="Authorization:Basic $AUTH_TOKEN"
echo -e "going to use elastic headers based on username and password"
else
# no auth header or username given -> use default auth header
ELASTICSEARCH_AUTH_HEADER="Accept: */*"
echo -e "going to use default elastic headers"
fi
fi

# will be using this for all curl communication with Elasticsearch:
CURL_ARGS=(
--silent
--header "$ELASTICSEARCH_AUTH_HEADER"
)
# ... also optionally use --insecure
if [[ $ELASTICSEARCH_INSECURE == true ]]; then
echo -e "Going to use default elastic insecure mode"
ELASTICSEARCH_INSECURE="-k "
else
unset ELASTICSEARCH_INSECURE
CURL_ARGS+=(--insecure)
fi

function create_datahub_usage_event_datastream() {
if [[ -z "$INDEX_PREFIX" ]]; then
PREFIX=''
else
PREFIX="${INDEX_PREFIX}_"
fi
echo -e "Create datahub_usage_event if needed against Elasticsearch at $ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT"
echo -e "Going to use index prefix:$PREFIX:"
POLICY_RESPONSE_CODE=$(curl -o /dev/null -s -w "%{http_code}" --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_ilm/policy/${PREFIX}datahub_usage_event_policy")
echo -e "Policy GET response code is $POLICY_RESPONSE_CODE"
POLICY_NAME="${PREFIX}datahub_usage_event_policy"
if [ $POLICY_RESPONSE_CODE -eq 404 ]; then
echo -e "\ncreating $POLICY_NAME"
sed -e "s/PREFIX/${PREFIX}/g" /index/usage-event/policy.json | tee -a /tmp/policy.json
curl -s -XPUT --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_ilm/policy/$POLICY_NAME" --header "Content-Type: application/json" --data "@/tmp/policy.json"
elif [ $POLICY_RESPONSE_CODE -eq 200 ]; then
echo -e "\n${POLICY_NAME} exists"
elif [ $POLICY_RESPONSE_CODE -eq 403 ]; then
echo -e "Forbidden so exiting"
exit 1
else
echo -e "Got response code $POLICY_RESPONSE_CODE while creating policy so exiting."
exit 1
fi

TEMPLATE_RESPONSE_CODE=$(curl -o /dev/null -s -w "%{http_code}" --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_index_template/${PREFIX}datahub_usage_event_index_template")
echo -e "Template GET response code is $TEMPLATE_RESPONSE_CODE"
TEMPLATE_NAME="${PREFIX}datahub_usage_event_index_template"
if [ $TEMPLATE_RESPONSE_CODE -eq 404 ]; then
echo -e "\ncreating $TEMPLATE_NAME"
sed -e "s/PREFIX/${PREFIX}/g" /index/usage-event/index_template.json | tee -a /tmp/index_template.json
curl -s -XPUT --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_index_template/$TEMPLATE_NAME" --header "Content-Type: application/json" --data "@/tmp/index_template.json"
elif [ $TEMPLATE_RESPONSE_CODE -eq 200 ]; then
echo -e "\n$TEMPLATE_NAME exists"
elif [ $TEMPLATE_RESPONSE_CODE -eq 403 ]; then
echo -e "Forbidden so exiting"
# index prefix used throughout the script
if [[ -z "$INDEX_PREFIX" ]]; then
PREFIX=''
echo -e "not using any prefix"
else
PREFIX="${INDEX_PREFIX}_"
echo -e "going to use prefix: '$PREFIX'"
fi

# path where index definitions are stored
INDEX_DEFINITIONS_ROOT=/index/usage-event
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thank you!



# Ensure that an Elasticsearch resource exists, creating it from a definition
# file when it is missing.
#
# Globals read:
#   CURL_ARGS               array of curl options used for every request
#   ELASTICSEARCH_URL       base URL the resource address is appended to
#   INDEX_DEFINITIONS_ROOT  directory that holds the definition files
#   PREFIX                  value substituted for the `PREFIX` placeholder
#   USE_AWS_ELASTICSEARCH   only consulted to print a configuration hint
# Globals written (kept non-local, as in the original):
#   RESOURCE_ADDRESS, RESOURCE_DEFINITION_NAME, RESOURCE_STATUS, TMP_SOURCE_PATH
# Arguments:
#   $1 - resource address relative to ELASTICSEARCH_URL
#   $2 - name of the definition file under INDEX_DEFINITIONS_ROOT
# Exit behavior:
#   returns normally on 200 (already exists) and after the PUT issued on 404;
#   calls `exit 1` on 403 and on every other response code.
function create_if_not_exists {
  RESOURCE_ADDRESS="$1"
  RESOURCE_DEFINITION_NAME="$2"

  # query ES to see if the resource already exists
  RESOURCE_STATUS=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}\n" "$ELASTICSEARCH_URL/$RESOURCE_ADDRESS")
  echo -e "\n>>> GET $RESOURCE_ADDRESS response code is $RESOURCE_STATUS"

  # compare the status as a string: an empty or malformed value then simply
  # falls through to the failure branch instead of raising a `[` syntax error
  case "$RESOURCE_STATUS" in
    200)
      # resource already exists -> nothing to do
      echo -e ">>> $RESOURCE_ADDRESS already exists ✓"
      ;;

    404)
      # resource doesn't exist -> need to create it
      echo -e ">>> creating $RESOURCE_ADDRESS because it doesn't exist ..."
      # use the file at given path as definition, but first replace all
      # occurrences of the `PREFIX` placeholder within the file with the
      # actual prefix value
      TMP_SOURCE_PATH="/tmp/$RESOURCE_DEFINITION_NAME"
      # overwrite (not append): a leftover file from a previous run must not be
      # concatenated with the fresh definition, or the PUT body is invalid JSON
      sed -e "s/PREFIX/$PREFIX/g" "$INDEX_DEFINITIONS_ROOT/$RESOURCE_DEFINITION_NAME" | tee "$TMP_SOURCE_PATH"
      curl "${CURL_ARGS[@]}" -XPUT "$ELASTICSEARCH_URL/$RESOURCE_ADDRESS" -H 'Content-Type: application/json' --data "@$TMP_SOURCE_PATH"
      ;;

    403)
      # probably authorization fail
      echo -e ">>> forbidden access to $RESOURCE_ADDRESS ! -> exiting"
      exit 1
      ;;

    *)
      # if `USE_AWS_ELASTICSEARCH` was not set to `true` when running against
      # AWS ES OSS, this script uses the wrong paths (e.g. `_ilm/policy/`
      # instead of the AWS-compatible `_opendistro/_ism/policies/`) and the
      # endpoint answers `401 Unauthorized` or `405 Method Not Allowed`;
      # use that as a chance to point out the likely misconfiguration
      if [[ "$RESOURCE_STATUS" == 401 || "$RESOURCE_STATUS" == 405 ]]; then
        if [[ $USE_AWS_ELASTICSEARCH == false ]] && [[ $ELASTICSEARCH_URL == *"amazonaws"* ]]; then
          echo "... looks like AWS OpenSearch is used; please set USE_AWS_ELASTICSEARCH env value to true"
        fi
      fi

      echo -e ">>> failed to GET $RESOURCE_ADDRESS ! -> exiting"
      exit 1
      ;;
  esac
}

# create indices for ES (non-AWS)
function create_datahub_usage_event_datastream() {
# non-AWS env requires creation of two resources for Datahub usage events:
# 1. ILM policy
create_if_not_exists "_ilm/policy/${PREFIX}datahub_usage_event_policy" policy.json
# 2. index template
create_if_not_exists "_index_template/${PREFIX}datahub_usage_event_index_template" index_template.json
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we not need a line like this here:

 create_if_not_exists "${PREFIX}datahub_usage_event" aws_es_usage_event.json

to ensure that the index is actually created for non aws cases?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because I do think there's a separate issue actively open where on fresh quickstarts without any usage events, the analytics index will be missing

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure how this works outside of AWS. The behavior in non-AWS environment should be the same as before refactoring.

Can you link the issue mentioned?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's this PR in question: #5974

}

# create indices for ES OSS (AWS)
function create_datahub_usage_event_aws_elasticsearch() {
if [[ -z "$INDEX_PREFIX" ]]; then
PREFIX=''
else
PREFIX="${INDEX_PREFIX}_"
fi
# AWS env requires creation of three resources for Datahub usage events:
# 1. ISM policy
create_if_not_exists "_opendistro/_ism/policies/${PREFIX}datahub_usage_event_policy" aws_es_ism_policy.json

if [ $(curl -o /dev/null -s -w "%{http_code}" --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_opendistro/_ism/policies/${PREFIX}datahub_usage_event_policy") -eq 404 ]
then
echo -e "\ncreating datahub_usage_event_policy"
sed -e "s/PREFIX/${PREFIX}/g" /index/usage-event/aws_es_ism_policy.json | tee -a /tmp/aws_es_ism_policy.json
curl -XPUT --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_opendistro/_ism/policies/${PREFIX}datahub_usage_event_policy" -H 'Content-Type: application/json' --data @/tmp/aws_es_ism_policy.json
else
echo -e "\ndatahub_usage_event_policy exists"
fi
if [ $(curl -o /dev/null -s -w "%{http_code}" --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_template/${PREFIX}datahub_usage_event_index_template") -eq 404 ]
then
echo -e "\ncreating datahub_usage_event_index_template"
sed -e "s/PREFIX/${PREFIX}/g" /index/usage-event/aws_es_index_template.json | tee -a /tmp/aws_es_index_template.json
curl -XPUT --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/_template/${PREFIX}datahub_usage_event_index_template" -H 'Content-Type: application/json' --data @/tmp/aws_es_index_template.json
curl -XPUT --header "$ELASTICSEARCH_AUTH_HEADER" "${ELASTICSEARCH_INSECURE}$ELASTICSEARCH_PROTOCOL://$ELASTICSEARCH_HOST:$ELASTICSEARCH_PORT/${PREFIX}datahub_usage_event-000001" -H 'Content-Type: application/json' --data "{\"aliases\":{\"${PREFIX}datahub_usage_event\":{\"is_write_index\":true}}}"
else
echo -e "\ndatahub_usage_event_index_template exists"
# 2. index template
create_if_not_exists "_template/${PREFIX}datahub_usage_event_index_template" aws_es_index_template.json

# 3. event index datahub_usage_event-000001
# (note that AWS *rollover* indices need to use `^.*-\d+$` naming pattern)
# -> https://aws.amazon.com/premiumsupport/knowledge-center/opensearch-failed-rollover-index/
INDEX_SUFFIX="000001"
# ... but first check whether `datahub_usage_event` was already auto-created by GMS before `datahub_usage_event-000001`
# (a common case when this script was initially run without setting `USE_AWS_ELASTICSEARCH` to `true`)
# -> https://github.com/datahub-project/datahub/issues/5376
USAGE_EVENT_STATUS=$(curl "${CURL_ARGS[@]}" -o /dev/null -w "%{http_code}\n" "$ELASTICSEARCH_URL/${PREFIX}datahub_usage_event")
if [ $USAGE_EVENT_STATUS -eq 200 ]; then
USAGE_EVENT_DEFINITION=$(curl "${CURL_ARGS[@]}" "$ELASTICSEARCH_URL/${PREFIX}datahub_usage_event")
# the definition is expected to contain "datahub_usage_event-000001" string
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is that?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Consider adding to comment here)

if [[ $USAGE_EVENT_DEFINITION != *"datahub_usage_event-$INDEX_SUFFIX"* ]]; then
# ... if it doesn't, we need to drop it
echo -e "\n>>> deleting invalid datahub_usage_event ..."
curl "${CURL_ARGS[@]}" -XDELETE "$ELASTICSEARCH_URL/${PREFIX}datahub_usage_event"
# ... and then recreate it below
fi
fi

# ... now we are safe to create the index
create_if_not_exists "${PREFIX}datahub_usage_event-$INDEX_SUFFIX" aws_es_index.json
}

if [[ $DATAHUB_ANALYTICS_ENABLED == true ]]; then
Expand All @@ -119,4 +158,4 @@ else
elif [ $DATAHUB_USAGE_EVENT_INDEX_RESPONSE_CODE -eq 403 ]; then
echo -e "Forbidden so exiting"
fi
fi
fi
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"aliases": {
"PREFIXdatahub_usage_event": {
"is_write_index": true
}
}
}