Skip to content
This repository has been archived by the owner on Apr 12, 2019. It is now read-only.

Faster commit lookup #91

Merged
merged 4 commits into from
Dec 10, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
307 changes: 307 additions & 0 deletions commit_info.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
// Copyright 2017 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

package git

import (
"bufio"
"context"
"fmt"
"os/exec"
"path"
"runtime"
"strconv"
"strings"
"sync"
"time"
)

const (
// parameters for searching for commit infos. If the untargeted search has
// not found any entries in the past 5 commits, and 12 or fewer entries
// remain, then we'll just let the targeted-searching threads finish off,
// and stop the untargeted search to not interfere.
deferToTargetedSearchColdStreak = 5
deferToTargetedSearchNumRemainingEntries = 12
)

// getCommitsInfoState shared state while getting commit info for entries
type getCommitsInfoState struct {
lock sync.Mutex
/* read-only fields, can be read without the mutex */
// entries and entryPaths are read-only after initialization, so they can
// safely be read without the mutex
entries []*TreeEntry
// set of filepaths to get info for
entryPaths map[string]struct{}
treePath string
headCommit *Commit

/* mutable fields, must hold mutex to read or write */
// map from filepath to commit
commits map[string]*Commit
// set of filepaths that have been or are being searched for in a target search
targetedPaths map[string]struct{}
}

func (state *getCommitsInfoState) numRemainingEntries() int {
state.lock.Lock()
defer state.lock.Unlock()
return len(state.entries) - len(state.commits)
}

// getTargetEntryPath Returns the next path for a targeted-searching thread to
// search for, or returns the empty string if nothing left to search for
func (state *getCommitsInfoState) getTargetedEntryPath() string {
var targetedEntryPath string
state.lock.Lock()
defer state.lock.Unlock()
for _, entry := range state.entries {
entryPath := path.Join(state.treePath, entry.Name())
if _, ok := state.commits[entryPath]; ok {
continue
} else if _, ok = state.targetedPaths[entryPath]; ok {
continue
}
targetedEntryPath = entryPath
state.targetedPaths[entryPath] = struct{}{}
break
}
return targetedEntryPath
}

// repeatedly perform targeted searches for unpopulated entries
func targetedSearch(state *getCommitsInfoState, done chan error) {
for {
entryPath := state.getTargetedEntryPath()
if len(entryPath) == 0 {
done <- nil
return
}
command := NewCommand("rev-list", "-1", "HEAD", "--", entryPath)
output, err := command.RunInDir(state.headCommit.repo.Path)
if err != nil {
done <- err
return
}
id, err := NewIDFromString(strings.TrimSpace(output))
if err != nil {
done <- err
return
}
commit, err := state.headCommit.repo.getCommit(id)
if err != nil {
done <- err
return
}
state.update(entryPath, commit)
}
}

func initGetCommitInfoState(entries Entries, headCommit *Commit, treePath string) *getCommitsInfoState {
entryPaths := make(map[string]struct{}, len(entries))
for _, entry := range entries {
entryPaths[path.Join(treePath, entry.Name())] = struct{}{}
}
if treePath = path.Clean(treePath); treePath == "." {
treePath = ""
}
return &getCommitsInfoState{
entries: entries,
entryPaths: entryPaths,
commits: make(map[string]*Commit, len(entries)),
targetedPaths: make(map[string]struct{}, len(entries)),
treePath: treePath,
headCommit: headCommit,
}
}

// GetCommitsInfo gets information of all commits that are corresponding to these entries
func (tes Entries) GetCommitsInfo(commit *Commit, treePath string) ([][]interface{}, error) {
state := initGetCommitInfoState(tes, commit, treePath)
if err := getCommitsInfo(state); err != nil {
return nil, err
}
if len(state.commits) < len(state.entryPaths) {
return nil, fmt.Errorf("could not find commits for all entries")
}

commitsInfo := make([][]interface{}, len(tes))
for i, entry := range tes {
commit, ok := state.commits[path.Join(treePath, entry.Name())]
if !ok {
return nil, fmt.Errorf("could not find commit for %s", entry.Name())
}
switch entry.Type {
case ObjectCommit:
subModuleURL := ""
if subModule, err := state.headCommit.GetSubModule(entry.Name()); err != nil {
return nil, err
} else if subModule != nil {
subModuleURL = subModule.URL
}
subModuleFile := NewSubModuleFile(commit, subModuleURL, entry.ID.String())
commitsInfo[i] = []interface{}{entry, subModuleFile}
default:
commitsInfo[i] = []interface{}{entry, commit}
}
}
return commitsInfo, nil
}

func (state *getCommitsInfoState) cleanEntryPath(rawEntryPath string) (string, error) {
if rawEntryPath[0] == '"' {
var err error
rawEntryPath, err = strconv.Unquote(rawEntryPath)
if err != nil {
return rawEntryPath, err
}
}
var entryNameStartIndex int
if len(state.treePath) > 0 {
entryNameStartIndex = len(state.treePath) + 1
}

if index := strings.IndexByte(rawEntryPath[entryNameStartIndex:], '/'); index >= 0 {
return rawEntryPath[:entryNameStartIndex+index], nil
}
return rawEntryPath, nil
}

// update report that the given path was last modified by the given commit.
// Returns whether state.commits was updated
func (state *getCommitsInfoState) update(entryPath string, commit *Commit) bool {
if _, ok := state.entryPaths[entryPath]; !ok {
return false
}

var updated bool
state.lock.Lock()
defer state.lock.Unlock()
if _, ok := state.commits[entryPath]; !ok {
state.commits[entryPath] = commit
updated = true
}
return updated
}

const getCommitsInfoPretty = "--pretty=format:%H %ct %s"

func getCommitsInfo(state *getCommitsInfoState) error {
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Minute)
defer cancel()

args := []string{"log", getCommitsInfoPretty, "--name-status", "-c"}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this command ever be faster than the targetedSearch ?

Copy link
Member Author

@ethantkoenig ethantkoenig Nov 23, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it is faster for repos with many files (which would otherwise require many calls to targetedSearch vs. one untargeted search). See the benchmark results for the manyfiles repo.

Copy link
Member

@bkcsoft bkcsoft Nov 23, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe have 2 benchmarks, one for each type of search, just to convince me 😛

Copy link
Member Author

@ethantkoenig ethantkoenig Nov 23, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bkcsoft I believe the existing benchmarks are telling on their own. Using only the targetedSearch is essentially the same as the old implementation (aside from a few small tweaks). The only explanation for the very large (>150x) performance improvement in the manyfiles repo between the old and new implementations is that running an "untargeted" search in parallel is helpful (in this case, very helpful).

The advantage of using a "untargeted" approach in repos with many files is also reflected in the benchmarks for #53 (which solely used an untargeted approach)

if len(state.treePath) > 0 {
args = append(args, "--", state.treePath)
}
cmd := exec.CommandContext(ctx, "git", args...)
cmd.Dir = state.headCommit.repo.Path

readCloser, err := cmd.StdoutPipe()
if err != nil {
return err
}

if err := cmd.Start(); err != nil {
return err
}

numThreads := runtime.NumCPU()
done := make(chan error, numThreads)
for i := 0; i < numThreads; i++ {
go targetedSearch(state, done)
}

scanner := bufio.NewScanner(readCloser)
err = state.processGitLogOutput(scanner)
for i := 0; i < numThreads; i++ {
doneErr := <-done
if doneErr != nil && err == nil {
err = doneErr
}
}
return err
}

func (state *getCommitsInfoState) processGitLogOutput(scanner *bufio.Scanner) error {
// keep a local cache of seen paths to avoid acquiring a lock for paths
// we've already seen
seenPaths := make(map[string]struct{}, len(state.entryPaths))
// number of consecutive commits without any finds
coldStreak := 0
var commit *Commit
var err error
for scanner.Scan() {
line := scanner.Text()
if len(line) == 0 { // in-between commits
numRemainingEntries := state.numRemainingEntries()
if numRemainingEntries == 0 {
break
}
if coldStreak >= deferToTargetedSearchColdStreak &&
numRemainingEntries <= deferToTargetedSearchNumRemainingEntries {
// stop this untargeted search, and let the targeted-search threads
// finish the work
break
}
continue
}
if line[0] >= 'A' && line[0] <= 'X' { // a file was changed by the current commit
// look for the last tab, since for copies (C) and renames (R) two
// filenames are printed: src, then dest
tabIndex := strings.LastIndexByte(line, '\t')
if tabIndex < 1 {
return fmt.Errorf("misformatted line: %s", line)
}
entryPath, err := state.cleanEntryPath(line[tabIndex+1:])
if err != nil {
return err
}
if _, ok := seenPaths[entryPath]; !ok {
if state.update(entryPath, commit) {
coldStreak = 0
}
seenPaths[entryPath] = struct{}{}
}
continue
}

// a new commit
commit, err = parseCommitInfo(line)
if err != nil {
return err
}
coldStreak++
}
return scanner.Err()
}

// parseCommitInfo parse a commit from a line of `git log` output. Expects the
// line to be formatted according to getCommitsInfoPretty.
func parseCommitInfo(line string) (*Commit, error) {
if len(line) < 43 {
return nil, fmt.Errorf("invalid git output: %s", line)
}
ref, err := NewIDFromString(line[:40])
if err != nil {
return nil, err
}
spaceIndex := strings.IndexByte(line[41:], ' ')
if spaceIndex < 0 {
return nil, fmt.Errorf("invalid git output: %s", line)
}
unixSeconds, err := strconv.Atoi(line[41 : 41+spaceIndex])
if err != nil {
return nil, err
}
message := line[spaceIndex+42:]
return &Commit{
ID: ref,
CommitMessage: message,
Committer: &Signature{
When: time.Unix(int64(unixSeconds), 0),
},
}, nil
}
59 changes: 59 additions & 0 deletions commit_info_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package git

import (
"os"
"path/filepath"
"testing"
"time"
)

const benchmarkReposDir = "benchmark/repos/"

func setupGitRepo(url string, name string) (string, error) {
repoDir := filepath.Join(benchmarkReposDir, name)
if _, err := os.Stat(repoDir); err == nil {
return repoDir, nil
}
return repoDir, Clone(url, repoDir, CloneRepoOptions{
Mirror: false,
Bare: false,
Quiet: true,
Timeout: 5 * time.Minute,
})
}

func BenchmarkEntries_GetCommitsInfo(b *testing.B) {
benchmarks := []struct {
url string
name string
}{
{url: "https://github.com/go-gitea/gitea.git", name: "gitea"},
{url: "https://github.com/ethantkoenig/manyfiles.git", name: "manyfiles"},
{url: "https://github.com/moby/moby.git", name: "moby"},
{url: "https://github.com/golang/go.git", name: "go"},
{url: "https://github.com/torvalds/linux.git", name: "linux"},
}
for _, benchmark := range benchmarks {
var commit *Commit
var entries Entries
if repoPath, err := setupGitRepo(benchmark.url, benchmark.name); err != nil {
b.Fatal(err)
} else if repo, err := OpenRepository(repoPath); err != nil {
b.Fatal(err)
} else if commit, err = repo.GetBranchCommit("master"); err != nil {
b.Fatal(err)
} else if entries, err = commit.Tree.ListEntries(); err != nil {
b.Fatal(err)
}
entries.Sort()
b.ResetTimer()
b.Run(benchmark.name, func(b *testing.B) {
for i := 0; i < b.N; i++ {
_, err := entries.GetCommitsInfo(commit, "")
if err != nil {
b.Fatal(err)
}
}
})
}
}
Loading