rclone/backend/hasher/object.go
nielash 9b650d3517 hasher: look for cached hash if passed hash unexpectedly blank
Before this change, Hasher did not check whether a "passed hash" (hashtype
natively supported by the wrapped backend) returned from a backend was blank,
and would sometimes return a blank hash to the caller even when a non-blank hash
was already stored in the db. This caused issues with, for example, Google
Drive, which has SHA1 / SHA256 hashes for some files but not others
(https://rclone.org/drive/#sha1-or-sha256-hashes-may-be-missing) and sometimes also
does not have hashes for very recently modified files.

After this change, Hasher will check if the received "passed hash" is
unexpectedly blank, and if so, it will continue to try other enabled methods,
such as retrieving a value from the database, or possibly regenerating it.

https://forum.rclone.org/t/hasher-with-gdrive-backend-does-not-return-sha1-sha256-for-old-files/44680/9?u=nielash
2024-03-09 11:58:02 +00:00

312 lines
7.9 KiB
Go

package hasher
import (
"context"
"errors"
"fmt"
"io"
"path"
"time"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/fs/operations"
)
// obtain hash for an object
func (o *Object) getHash(ctx context.Context, hashType hash.Type) (string, error) {
maxAge := time.Duration(o.f.opt.MaxAge)
if maxAge <= 0 {
return "", nil
}
fp := o.fingerprint(ctx)
if fp == "" {
return "", errors.New("fingerprint failed")
}
return o.f.getRawHash(ctx, hashType, o.Remote(), fp, maxAge)
}
// obtain hash for a path
func (f *Fs) getRawHash(ctx context.Context, hashType hash.Type, remote, fp string, age time.Duration) (string, error) {
key := path.Join(f.Fs.Root(), remote)
op := &kvGet{
key: key,
fp: fp,
hash: hashType.String(),
age: age,
}
err := f.db.Do(false, op)
return op.val, err
}
// put new hashes for an object
func (o *Object) putHashes(ctx context.Context, rawHashes hashMap) error {
if o.f.opt.MaxAge <= 0 {
return nil
}
fp := o.fingerprint(ctx)
if fp == "" {
return nil
}
key := path.Join(o.f.Fs.Root(), o.Remote())
hashes := operations.HashSums{}
for hashType, hashVal := range rawHashes {
hashes[hashType.String()] = hashVal
}
return o.f.putRawHashes(ctx, key, fp, hashes)
}
// set hashes for a path without any validation
func (f *Fs) putRawHashes(ctx context.Context, key, fp string, hashes operations.HashSums) error {
return f.db.Do(true, &kvPut{
key: key,
fp: fp,
hashes: hashes,
age: time.Duration(f.opt.MaxAge),
})
}
// Hash returns the selected checksum of the file or "" if unavailable.
func (o *Object) Hash(ctx context.Context, hashType hash.Type) (hashVal string, err error) {
f := o.f
if f.passHashes.Contains(hashType) {
fs.Debugf(o, "pass %s", hashType)
hashVal, err = o.Object.Hash(ctx, hashType)
if hashVal != "" {
return hashVal, err
}
if err != nil {
fs.Debugf(o, "error passing %s: %v", hashType, err)
}
fs.Debugf(o, "passed %s is blank -- trying other methods", hashType)
}
if !f.suppHashes.Contains(hashType) {
fs.Debugf(o, "unsupp %s", hashType)
return "", hash.ErrUnsupported
}
if hashVal, err = o.getHash(ctx, hashType); err != nil {
fs.Debugf(o, "getHash: %v", err)
err = nil
hashVal = ""
}
if hashVal != "" {
fs.Debugf(o, "cached %s = %q", hashType, hashVal)
return hashVal, nil
}
if f.slowHashes.Contains(hashType) {
fs.Debugf(o, "slow %s", hashType)
hashVal, err = o.Object.Hash(ctx, hashType)
if err == nil && hashVal != "" && f.keepHashes.Contains(hashType) {
if err = o.putHashes(ctx, hashMap{hashType: hashVal}); err != nil {
fs.Debugf(o, "putHashes: %v", err)
err = nil
}
}
return hashVal, err
}
if f.autoHashes.Contains(hashType) && o.Size() < int64(f.opt.AutoSize) {
_ = o.updateHashes(ctx)
if hashVal, err = o.getHash(ctx, hashType); err != nil {
fs.Debugf(o, "auto %s = %q (%v)", hashType, hashVal, err)
err = nil
}
}
return hashVal, err
}
// updateHashes performs implicit "rclone hashsum --download" and updates cache.
func (o *Object) updateHashes(ctx context.Context) error {
r, err := o.Open(ctx)
if err != nil {
fs.Infof(o, "update failed (open): %v", err)
return err
}
defer func() {
_ = r.Close()
}()
if _, err = io.Copy(io.Discard, r); err != nil {
fs.Infof(o, "update failed (copy): %v", err)
return err
}
return nil
}
// Update the object with the given data, time and size.
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
_ = o.f.pruneHash(src.Remote())
return o.Object.Update(ctx, in, src, options...)
}
// Remove an object.
func (o *Object) Remove(ctx context.Context) error {
_ = o.f.pruneHash(o.Remote())
return o.Object.Remove(ctx)
}
// SetModTime sets the modification time of the file.
// Also prunes the cache entry when modtime changes so that
// touching a file will trigger checksum recalculation even
// on backends that don't provide modTime with fingerprint.
func (o *Object) SetModTime(ctx context.Context, mtime time.Time) error {
if mtime != o.Object.ModTime(ctx) {
_ = o.f.pruneHash(o.Remote())
}
return o.Object.SetModTime(ctx, mtime)
}
// Open opens the file for read.
// Full reads will also update object hashes.
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (r io.ReadCloser, err error) {
size := o.Size()
var offset, limit int64 = 0, -1
for _, option := range options {
switch opt := option.(type) {
case *fs.SeekOption:
offset = opt.Offset
case *fs.RangeOption:
offset, limit = opt.Decode(size)
}
}
if offset < 0 {
return nil, errors.New("invalid offset")
}
if limit < 0 {
limit = size - offset
}
if r, err = o.Object.Open(ctx, options...); err != nil {
return nil, err
}
if offset != 0 || limit < size {
// It's a partial read
return r, err
}
return o.f.newHashingReader(ctx, r, func(sums hashMap) {
if err := o.putHashes(ctx, sums); err != nil {
fs.Infof(o, "auto hashing error: %v", err)
}
})
}
// Put data into the remote path with given modTime and size
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
var (
o fs.Object
common hash.Set
rehash bool
hashes hashMap
)
if fsrc := src.Fs(); fsrc != nil {
common = fsrc.Hashes().Overlap(f.keepHashes)
// Rehash if source does not have all required hashes or hashing is slow
rehash = fsrc.Features().SlowHash || common != f.keepHashes
}
wrapIn := in
if rehash {
r, err := f.newHashingReader(ctx, in, func(sums hashMap) {
hashes = sums
})
fs.Debugf(src, "Rehash in-fly due to incomplete or slow source set %v (err: %v)", common, err)
if err == nil {
wrapIn = r
} else {
rehash = false
}
}
_ = f.pruneHash(src.Remote())
oResult, err := f.Fs.Put(ctx, wrapIn, src, options...)
o, err = f.wrapObject(oResult, err)
if err != nil {
return nil, err
}
if !rehash {
hashes = hashMap{}
for _, ht := range common.Array() {
if h, e := src.Hash(ctx, ht); e == nil && h != "" {
hashes[ht] = h
}
}
}
if len(hashes) > 0 {
err := o.(*Object).putHashes(ctx, hashes)
fs.Debugf(o, "Applied %d source hashes, err: %v", len(hashes), err)
}
return o, err
}
type hashingReader struct {
rd io.Reader
hasher *hash.MultiHasher
fun func(hashMap)
}
func (f *Fs) newHashingReader(ctx context.Context, rd io.Reader, fun func(hashMap)) (*hashingReader, error) {
hasher, err := hash.NewMultiHasherTypes(f.keepHashes)
if err != nil {
return nil, err
}
hr := &hashingReader{
rd: rd,
hasher: hasher,
fun: fun,
}
return hr, nil
}
func (r *hashingReader) Read(p []byte) (n int, err error) {
n, err = r.rd.Read(p)
if err != nil && err != io.EOF {
r.hasher = nil
}
if r.hasher != nil {
if _, errHash := r.hasher.Write(p[:n]); errHash != nil {
r.hasher = nil
err = errHash
}
}
if err == io.EOF && r.hasher != nil {
r.fun(r.hasher.Sums())
r.hasher = nil
}
return
}
func (r *hashingReader) Close() error {
if rc, ok := r.rd.(io.ReadCloser); ok {
return rc.Close()
}
return nil
}
// Return object fingerprint or empty string in case of errors
//
// Note that we can't use the generic `fs.Fingerprint` here because
// this fingerprint is used to pick _derived hashes_ that are slow
// to calculate or completely unsupported by the base remote.
//
// The hasher fingerprint must be based on `fsHash`, the first _fast_
// hash supported _by the underlying remote_ (if there is one),
// while `fs.Fingerprint` would select a hash _produced by hasher_
// creating unresolvable fingerprint loop.
func (o *Object) fingerprint(ctx context.Context) string {
size := o.Object.Size()
timeStr := "-"
if o.f.fpTime {
timeStr = o.Object.ModTime(ctx).UTC().Format(timeFormat)
if timeStr == "" {
return ""
}
}
hashStr := "-"
if o.f.fpHash != hash.None {
var err error
hashStr, err = o.Object.Hash(ctx, o.f.fpHash)
if hashStr == "" || err != nil {
return ""
}
}
return fmt.Sprintf("%d,%s,%s", size, timeStr, hashStr)
}