Skip to content

Commit

Permalink
feat: add async reindexing for large dataset (#94)
Browse files Browse the repository at this point in the history
* feat: add async reindexing for large dataset

* fix: respect wait_for_completion param
  • Loading branch information
lakhansamani authored Mar 27, 2020
1 parent fac6e6b commit b4e0dac
Show file tree
Hide file tree
Showing 3 changed files with 152 additions and 34 deletions.
121 changes: 93 additions & 28 deletions plugins/reindexer/dao.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ package reindexer
import (
"context"
"encoding/json"
"errors"
"fmt"
"regexp"
"time"

log "github.com/sirupsen/logrus"

Expand All @@ -13,6 +15,35 @@ import (
es7 "github.com/olivere/elastic/v7"
)

// postReIndex finalizes a re-index operation: it carries the alias of the
// source index over to the newly created index and deletes the source index.
// If the source index had no alias, the source index's own name is set as an
// alias on the new index so existing callers keep resolving to the new data.
func postReIndex(ctx context.Context, sourceIndex, newIndexName string) error {
	// Fetch all the aliases of old index
	alias, err := aliasesOf(ctx, sourceIndex)
	if err != nil {
		return fmt.Errorf(`error fetching aliases of index "%s": %v`, sourceIndex, err)
	}

	aliases := []string{}
	if alias == "" {
		// No alias exists: reuse the old index name as the alias.
		aliases = append(aliases, sourceIndex)
	} else {
		aliases = append(aliases, alias)
	}

	// Delete old index
	if err := deleteIndex(ctx, sourceIndex); err != nil {
		return fmt.Errorf(`error deleting source index "%s": %v`, sourceIndex, err)
	}

	// Set aliases of old index to the new index.
	if err := setAlias(ctx, newIndexName, aliases...); err != nil {
		return fmt.Errorf(`error setting alias for "%s": %v`, newIndexName, err)
	}

	return nil
}

// Reindex Inplace: https://www.elastic.co/guide/en/elasticsearch/reference/current/reindex-upgrade-inplace.html
//
// 1. Create a new index and copy the mappings and settings from the old index.
Expand Down Expand Up @@ -116,53 +147,31 @@ func reindex(ctx context.Context, sourceIndex string, config *reindexConfig, wai
Destination(dest).
WaitForCompletion(waitForCompletion)

// If wait_for_completion = true, then we carry out the task synchronously along with three more steps:
// - fetch any aliases of the old index
// - delete the old index
// - set the aliases of the old index to the new index
if waitForCompletion {
response, err := reindex.Do(ctx)
if err != nil {
return nil, err
}

if destinationIndex == "" {
// Fetch all the aliases of old index
alias, err := aliasesOf(ctx, sourceIndex)

var aliases = []string{}
err = postReIndex(ctx, sourceIndex, newIndexName)
if err != nil {
return nil, fmt.Errorf(`error fetching aliases of index "%s": %v`, sourceIndex, err)
}

if alias == "" {
aliases = append(aliases, sourceIndex)
} else {
aliases = append(aliases, alias)
}

// Delete old index
err = deleteIndex(ctx, sourceIndex)
if err != nil {
return nil, fmt.Errorf(`error deleting index "%s": %v\n`, sourceIndex, err)
}
// Set aliases of old index to the new index.
err = setAlias(ctx, newIndexName, aliases...)
if err != nil {
return nil, fmt.Errorf(`error setting alias "%s" for index "%s"`, sourceIndex, newIndexName)
return nil, err
}
}

return json.Marshal(response)
}

// If wait_for_completion = false, we carry out the reindexing asynchronously and return the task ID.
// If wait_for_completion = false, we carry out the re-indexing asynchronously and return the task ID.
log.Println(logTag, fmt.Sprintf(" Data is > %d so using async reindex", IndexStoreSize))
response, err := reindex.DoAsync(context.Background())
if err != nil {
return nil, err
}
taskID := response.TaskId

go asyncReIndex(taskID, sourceIndex, newIndexName)

// Get the reindex task by ID
task, err := util.GetClient7().TasksGetTask().TaskId(taskID).Do(context.Background())
if err != nil {
Expand Down Expand Up @@ -383,3 +392,59 @@ func getAliasIndexMap(ctx context.Context) (map[string]string, error) {

return res, nil
}

// getIndexSize returns the size in bytes of the primary store of the given
// index. If indexName is an alias, it is first resolved to the concrete index.
func getIndexSize(ctx context.Context, indexName string) (int64, error) {
	// Resolve alias to the underlying index; fall back to the name itself.
	index := classify.GetAliasIndex(indexName)
	if index == "" {
		index = indexName
	}
	// BUG FIX: stats were previously requested for indexName but read from
	// stats.Indices[index]; when the alias resolved to a different concrete
	// index the map lookup missed and dereferencing Primaries panicked.
	stats, err := util.GetClient7().IndexStats(index).Do(ctx)
	if err != nil {
		return 0, err
	}
	// Guard the map access to avoid a nil-pointer dereference when the
	// index is missing from the response.
	indexStats, ok := stats.Indices[index]
	if !ok || indexStats.Primaries == nil || indexStats.Primaries.Store == nil {
		return 0, fmt.Errorf(`unable to fetch store stats for index "%s"`, index)
	}
	return indexStats.Primaries.Store.SizeInBytes, nil
}

// isTaskCompleted reports whether the elasticsearch task identified by
// taskID has finished running.
func isTaskCompleted(ctx context.Context, taskID string) (bool, error) {
	status, err := util.GetClient7().TasksGetTask().TaskId(taskID).Do(ctx)
	if err != nil {
		log.Errorln(logTag, " Get task status error", err)
		return false, err
	}
	return status.Completed, nil
}

// asyncReIndex tracks an async re-indexing task for a given source and
// destination index. It polls every 30s until the task completes, then runs
// the post re-index steps (alias transfer, source index deletion) and removes
// the task from the currently-processing cache. Intended to run as a goroutine.
func asyncReIndex(taskID, source, destination string) {
	SetCurrentProcess(taskID, source, destination)
	// Use NewTicker (not time.Tick) so the ticker can be stopped when this
	// function returns; time.Tick leaks the underlying ticker.
	ticker := time.NewTicker(30 * time.Second)
	defer ticker.Stop()
	ctx := context.Background()

	for range ticker.C {
		// Errors are already logged inside isTaskCompleted; on error we just
		// keep polling until the task status can be fetched again.
		done, _ := isTaskCompleted(ctx, taskID)
		if !done {
			log.Println(logTag, " "+taskID+" task is still re-indexing data...")
			continue
		}
		log.Println(logTag, taskID+" task completed successfully")
		// remove process from current cache
		RemoveCurrentProcess(taskID)
		if err := postReIndex(ctx, source, destination); err != nil {
			log.Errorln(logTag, " post re-indexing error: ", err)
		}
		return
	}
}
27 changes: 21 additions & 6 deletions plugins/reindexer/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package reindexer

import (
"encoding/json"
"fmt"
"io/ioutil"
"net/http"
"strconv"
Expand All @@ -28,8 +29,11 @@ func (rx *reindexer) reindex() http.HandlerFunc {
if checkVar(ok, w, "index") {
return
}

err, body, waitForCompletion, done := reindexConfigResponse(req, w)
if IsReIndexInProcess(indexName, "") {
util.WriteBackError(w, fmt.Sprintf(`Re-indexing is already in progress for %s index`, indexName), http.StatusInternalServerError)
return
}
err, body, waitForCompletion, done := reindexConfigResponse(req, w, indexName)
if done {
return
}
Expand All @@ -50,7 +54,7 @@ func (rx *reindexer) reindexSrcToDest() http.HandlerFunc {
if checkVar(okD, w, "destination_index") {
return
}
err, body, waitForCompletion, done := reindexConfigResponse(req, w)
err, body, waitForCompletion, done := reindexConfigResponse(req, w, sourceIndex)
if done {
return
}
Expand Down Expand Up @@ -91,7 +95,7 @@ func checkVar(okS bool, w http.ResponseWriter, variable string) bool {
return false
}

func reindexConfigResponse(req *http.Request, w http.ResponseWriter) (error, reindexConfig, bool, bool) {
func reindexConfigResponse(req *http.Request, w http.ResponseWriter, sourceIndex string) (error, reindexConfig, bool, bool) {
reqBody, err := ioutil.ReadAll(req.Body)
if err != nil {
log.Errorln(logTag, ":", err)
Expand All @@ -108,10 +112,21 @@ func reindexConfigResponse(req *http.Request, w http.ResponseWriter) (error, rei
return nil, reindexConfig{}, false, true
}

// By default, wait_for_completion = true
// By default, wait_for_completion depends on size of index
param := req.URL.Query().Get("wait_for_completion")
if param == "" {
param = "true"
// Get the size of currentIndex, if that is > IndexStoreSize (5MB - 5000000 Bytes) then do async re-indexing.
size, err := getIndexSize(req.Context(), sourceIndex)
if err != nil {
log.Errorln(logTag, ":", err)
util.WriteBackError(w, "Unable to get the size of "+sourceIndex, http.StatusBadRequest)
return nil, reindexConfig{}, false, true
}
if size > IndexStoreSize {
param = "false"
} else {
param = "true"
}
}
waitForCompletion, err := strconv.ParseBool(param)
if err != nil {
Expand Down
38 changes: 38 additions & 0 deletions plugins/reindexer/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"regexp"
"strconv"
"strings"
"sync"

"github.com/appbaseio/arc/middleware/classify"
log "github.com/sirupsen/logrus"
Expand All @@ -26,6 +27,15 @@ type AliasedIndices struct {
PriStoreSize string `json:"pri.store.size"`
}

// CurrentlyReIndexingProcess maps a task ID to its [source, destination]
// index pair for every re-indexing process currently in flight.
var CurrentlyReIndexingProcess = make(map[string][]string)

// CurrentlyReIndexingProcessMutex guards CurrentlyReIndexingProcess against
// concurrent access from the async re-index goroutines and HTTP handlers.
var CurrentlyReIndexingProcessMutex = sync.RWMutex{}

// IndexStoreSize is the index-size threshold in bytes (5 MB) above which
// re-indexing is carried out asynchronously instead of synchronously.
const IndexStoreSize = int64(5000000)

// reindexedName calculates from the name the number of times an index has been
// reindexed to generate the successive name for the index. For example: for an
// index named "twitter", the function returns "twitter_reindexed_1", and for an
Expand Down Expand Up @@ -76,3 +86,31 @@ func InitAliasIndexCache() {
classify.SetAliasIndexCache(aliasIndexMap)
log.Println(logTag, "=> Alias Index Cache", classify.GetAliasIndexCache())
}

// SetCurrentProcess records the source/destination index pair of a running
// re-indexing task under its task ID.
func SetCurrentProcess(taskID, source, destination string) {
	CurrentlyReIndexingProcessMutex.Lock()
	defer CurrentlyReIndexingProcessMutex.Unlock()
	CurrentlyReIndexingProcess[taskID] = []string{source, destination}
}

// RemoveCurrentProcess drops the given task ID from the set of currently
// running re-indexing processes.
func RemoveCurrentProcess(taskID string) {
	CurrentlyReIndexingProcessMutex.Lock()
	defer CurrentlyReIndexingProcessMutex.Unlock()
	delete(CurrentlyReIndexingProcess, taskID)
}

// IsReIndexInProcess check if index is Processing currently
func IsReIndexInProcess(source, destination string) bool {
for _, processingIndexes := range CurrentlyReIndexingProcess {
if processingIndexes[0] == source || processingIndexes[0] == destination {
return true
}
if processingIndexes[1] == source || processingIndexes[1] == destination {
return true
}
}

return false
}

0 comments on commit b4e0dac

Please sign in to comment.