mirror of
https://github.com/rclone/rclone.git
synced 2025-01-21 21:58:58 +01:00
dirtree: fix performance with large directories of directories and --fast-list
Before this change if using --fast-list on a directory with more than a few thousand directories in it DirTree.CheckParents became very slow taking up to 24 hours for a directory with 1,000,000 directories in it. This is because it becomes an O(N²) operation as DirTree.Find has to search each directory in a linear fashion as it is stored as a slice. This patch fixes the problem by scanning the DirTree for directories before starting the CheckParents process so it never has to call DirTree.Find. After the fix calling DirTree.CheckParents on a directory with 1,000,000 directories in it will take about 1 second. Anything which calls DirTree.Find can potentially have bad performance so in the future we should redesign the DirTree to use a different underlying datastructure or have an index. https://forum.rclone.org/t/almost-24-hours-cpu-compute-time-during-sync-between-two-large-s3-buckets/39375/
This commit is contained in:
parent
a8ca18165e
commit
07133b892d
@ -63,10 +63,12 @@ func (dt DirTree) AddEntry(entry fs.DirEntry) {
|
||||
panic("unknown entry type")
|
||||
}
|
||||
remoteParent := parentDir(entry.Remote())
|
||||
dt.CheckParent("", remoteParent)
|
||||
dt.checkParent("", remoteParent, nil)
|
||||
}
|
||||
|
||||
// Find returns the DirEntry for filePath or nil if not found
|
||||
//
|
||||
// None that Find does a O(N) search so can be slow
|
||||
func (dt DirTree) Find(filePath string) (parentPath string, entry fs.DirEntry) {
|
||||
parentPath = parentDir(filePath)
|
||||
for _, entry := range dt[parentPath] {
|
||||
@ -77,23 +79,52 @@ func (dt DirTree) Find(filePath string) (parentPath string, entry fs.DirEntry) {
|
||||
return parentPath, nil
|
||||
}
|
||||
|
||||
// CheckParent checks that dirPath has a *Dir in its parent
|
||||
func (dt DirTree) CheckParent(root, dirPath string) {
|
||||
if dirPath == root {
|
||||
return
|
||||
// checkParent checks that dirPath has a *Dir in its parent
|
||||
//
|
||||
// If dirs is not nil it must contain entries for every *Dir found in
|
||||
// the tree. It is used to speed up the checking when calling this
|
||||
// repeatedly.
|
||||
func (dt DirTree) checkParent(root, dirPath string, dirs map[string]struct{}) {
|
||||
var parentPath string
|
||||
for {
|
||||
if dirPath == root {
|
||||
return
|
||||
}
|
||||
// Can rely on dirs to have all directories in it so
|
||||
// we don't need to call Find.
|
||||
if dirs != nil {
|
||||
if _, found := dirs[dirPath]; found {
|
||||
return
|
||||
}
|
||||
parentPath = parentDir(dirPath)
|
||||
} else {
|
||||
var entry fs.DirEntry
|
||||
parentPath, entry = dt.Find(dirPath)
|
||||
if entry != nil {
|
||||
return
|
||||
}
|
||||
}
|
||||
dt[parentPath] = append(dt[parentPath], fs.NewDir(dirPath, time.Now()))
|
||||
if dirs != nil {
|
||||
dirs[dirPath] = struct{}{}
|
||||
}
|
||||
dirPath = parentPath
|
||||
}
|
||||
parentPath, entry := dt.Find(dirPath)
|
||||
if entry != nil {
|
||||
return
|
||||
}
|
||||
dt[parentPath] = append(dt[parentPath], fs.NewDir(dirPath, time.Now()))
|
||||
dt.CheckParent(root, parentPath)
|
||||
}
|
||||
|
||||
// CheckParents checks every directory in the tree has *Dir in its parent
|
||||
func (dt DirTree) CheckParents(root string) {
|
||||
dirs := make(map[string]struct{})
|
||||
// Find all the directories and stick them in dirs
|
||||
for _, entries := range dt {
|
||||
for _, entry := range entries {
|
||||
if _, ok := entry.(fs.Directory); ok {
|
||||
dirs[entry.Remote()] = struct{}{}
|
||||
}
|
||||
}
|
||||
}
|
||||
for dirPath := range dt {
|
||||
dt.CheckParent(root, dirPath)
|
||||
dt.checkParent(root, dirPath, dirs)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
package dirtree
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"github.com/rclone/rclone/fstest/mockdir"
|
||||
@ -108,7 +109,7 @@ func TestDirTreeCheckParent(t *testing.T) {
|
||||
sausage
|
||||
`, dt.String())
|
||||
|
||||
dt.CheckParent("", "dir/subdir")
|
||||
dt.checkParent("", "dir/subdir", nil)
|
||||
|
||||
assert.Equal(t, `/
|
||||
dir/
|
||||
@ -200,3 +201,21 @@ dir2/
|
||||
`, dt.String())
|
||||
|
||||
}
|
||||
|
||||
func BenchmarkCheckParents(b *testing.B) {
|
||||
for _, N := range []int{1e2, 1e3, 1e4, 1e5, 1e6} {
|
||||
b.Run(fmt.Sprintf("%d", N), func(b *testing.B) {
|
||||
b.StopTimer()
|
||||
dt := New()
|
||||
for i := 0; i < N; i++ {
|
||||
remote := fmt.Sprintf("dir%09d/file%09d.txt", i, 1)
|
||||
o := mockobject.New(remote)
|
||||
dt.Add(o)
|
||||
}
|
||||
b.StartTimer()
|
||||
for n := 0; n < b.N; n++ {
|
||||
dt.CheckParents("")
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user