rclone/backend/http/http.go

774 lines
22 KiB
Go
Raw Permalink Normal View History

// Package http provides a filesystem interface using golang.org/net/http
//
// It treats HTML pages served from the endpoint as directory
// listings, and includes any links found as files.
package http
import (
"context"
"errors"
"fmt"
"io"
"mime"
"net/http"
"net/url"
"path"
"strings"
"sync"
"time"
"github.com/rclone/rclone/fs"
"github.com/rclone/rclone/fs/config/configmap"
"github.com/rclone/rclone/fs/config/configstruct"
"github.com/rclone/rclone/fs/fshttp"
"github.com/rclone/rclone/fs/hash"
"github.com/rclone/rclone/lib/rest"
"golang.org/x/net/html"
)
var (
errorReadOnly = errors.New("http remotes are read only")
timeUnset = time.Unix(0, 0)
)
func init() {
fsi := &fs.RegInfo{
Name: "http",
Description: "HTTP",
NewFs: NewFs,
CommandHelp: commandHelp,
Options: []fs.Option{{
Name: "url",
Help: "URL of HTTP host to connect to.\n\nE.g. \"https://example.com\", or \"https://user:pass@example.com\" to use a username and password.",
Required: true,
}, {
Name: "headers",
2021-08-16 11:30:01 +02:00
Help: `Set HTTP headers for all transactions.
2021-08-16 11:30:01 +02:00
Use this to set additional HTTP headers for all transactions.
The input format is comma separated list of key,value pairs. Standard
[CSV encoding](https://godoc.org/encoding/csv) may be used.
For example, to set a Cookie use 'Cookie,name=value', or '"Cookie","name=value"'.
You can set multiple headers, e.g. '"Cookie","name=value","Authorization","xxx"'.`,
Default: fs.CommaSepList{},
Advanced: true,
}, {
Name: "no_slash",
2021-08-16 11:30:01 +02:00
Help: `Set this if the site doesn't end directories with /.
Use this if your target website does not use / on the end of
directories.
A / on the end of a path is how rclone normally tells the difference
between files and directories. If this flag is set, then rclone will
treat all files with Content-Type: text/html as directories and read
URLs from them rather than downloading them.
Note that this may cause rclone to confuse genuine HTML files with
directories.`,
Default: false,
Advanced: true,
}, {
Name: "no_head",
Help: `Don't use HEAD requests.
HEAD requests are mainly used to find file sizes in dir listing.
If your site is being very slow to load then you can try this option.
Normally rclone does a HEAD request for each potential file in a
directory listing to:
- find its size
- check it really exists
- check to see if it is a directory
If you set this option, rclone will not do the HEAD request. This will mean
that directory listings are much quicker, but rclone won't have the times or
sizes of any files, and some files that don't exist may be in the listing.`,
Default: false,
Advanced: true,
}, {
Name: "no_escape",
Help: "Do not escape URL metacharacters in path names.",
Default: false,
}},
}
fs.Register(fsi)
}
// Options defines the configuration for this backend
type Options struct {
Endpoint string `config:"url"`
NoSlash bool `config:"no_slash"`
NoHead bool `config:"no_head"`
Headers fs.CommaSepList `config:"headers"`
NoEscape bool `config:"no_escape"`
}
// Fs stores the interface to the remote HTTP files
type Fs struct {
name string
root string
features *fs.Features // optional features
opt Options // options for this backend
ci *fs.ConfigInfo // global config
endpoint *url.URL
endpointURL string // endpoint as a string
httpClient *http.Client
}
// Object is a remote object that has been stat'd (so it exists, but is not necessarily open for reading)
type Object struct {
fs *Fs
remote string
size int64
modTime time.Time
contentType string
}
// statusError returns an error if the res contained an error
func statusError(res *http.Response, err error) error {
if err != nil {
return err
}
if res.StatusCode < 200 || res.StatusCode > 299 {
_ = res.Body.Close()
return fmt.Errorf("HTTP Error: %s", res.Status)
}
return nil
}
// getFsEndpoint decides if url is to be considered a file or directory,
// and returns a proper endpoint url to use for the fs.
func getFsEndpoint(ctx context.Context, client *http.Client, url string, opt *Options) (string, bool) {
// If url ends with '/' it is already a proper url always assumed to be a directory.
if url[len(url)-1] == '/' {
return url, false
}
// If url does not end with '/' we send a HEAD request to decide
// if it is directory or file, and if directory appends the missing
// '/', or if file returns the directory url to parent instead.
createFileResult := func() (string, bool) {
fs.Debugf(nil, "If path is a directory you must add a trailing '/'")
parent, _ := path.Split(url)
return parent, true
}
createDirResult := func() (string, bool) {
fs.Debugf(nil, "To avoid the initial HEAD request add a trailing '/' to the path")
return url + "/", false
}
// If HEAD requests are not allowed we just have to assume it is a file.
if opt.NoHead {
fs.Debugf(nil, "Assuming path is a file as --http-no-head is set")
return createFileResult()
}
// Use a client which doesn't follow redirects so the server
// doesn't redirect http://host/dir to http://host/dir/
noRedir := *client
noRedir.CheckRedirect = func(req *http.Request, via []*http.Request) error {
return http.ErrUseLastResponse
}
req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil)
if err != nil {
fs.Debugf(nil, "Assuming path is a file as HEAD request could not be created: %v", err)
return createFileResult()
}
addHeaders(req, opt)
res, err := noRedir.Do(req)
if err != nil {
fs.Debugf(nil, "Assuming path is a file as HEAD request could not be sent: %v", err)
return createFileResult()
}
if res.StatusCode == http.StatusNotFound {
fs.Debugf(nil, "Assuming path is a directory as HEAD response is it does not exist as a file (%s)", res.Status)
return createDirResult()
}
if res.StatusCode == http.StatusMovedPermanently ||
res.StatusCode == http.StatusFound ||
res.StatusCode == http.StatusSeeOther ||
res.StatusCode == http.StatusTemporaryRedirect ||
res.StatusCode == http.StatusPermanentRedirect {
redir := res.Header.Get("Location")
if redir != "" {
if redir[len(redir)-1] == '/' {
fs.Debugf(nil, "Assuming path is a directory as HEAD response is redirect (%s) to a path that ends with '/': %s", res.Status, redir)
return createDirResult()
}
fs.Debugf(nil, "Assuming path is a file as HEAD response is redirect (%s) to a path that does not end with '/': %s", res.Status, redir)
return createFileResult()
}
fs.Debugf(nil, "Assuming path is a file as HEAD response is redirect (%s) but no location header", res.Status)
return createFileResult()
}
if res.StatusCode < 200 || res.StatusCode > 299 {
// Example is 403 (http.StatusForbidden) for servers not allowing HEAD requests.
fs.Debugf(nil, "Assuming path is a file as HEAD response is an error (%s)", res.Status)
return createFileResult()
}
fs.Debugf(nil, "Assuming path is a file as HEAD response is success (%s)", res.Status)
return createFileResult()
}
// Make the http connection with opt
func (f *Fs) httpConnection(ctx context.Context, opt *Options) (isFile bool, err error) {
if len(opt.Headers)%2 != 0 {
return false, errors.New("odd number of headers supplied")
}
if !strings.HasSuffix(opt.Endpoint, "/") {
opt.Endpoint += "/"
}
// Parse the endpoint and stick the root onto it
base, err := url.Parse(opt.Endpoint)
if err != nil {
return false, err
}
u, err := rest.URLJoin(base, rest.URLPathEscape(f.root))
if err != nil {
return false, err
}
client := fshttp.NewClient(ctx)
endpoint, isFile := getFsEndpoint(ctx, client, u.String(), opt)
fs.Debugf(nil, "Root: %s", endpoint)
u, err = url.Parse(endpoint)
if err != nil {
return false, err
}
// Update f with the new parameters
f.httpClient = client
f.endpoint = u
f.endpointURL = u.String()
return isFile, nil
}
// NewFs creates a new Fs object from the name and root. It connects to
// the host specified in the config file.
func NewFs(ctx context.Context, name, root string, m configmap.Mapper) (fs.Fs, error) {
// Parse config into Options struct
opt := new(Options)
err := configstruct.Set(m, opt)
if err != nil {
return nil, err
}
ci := fs.GetConfig(ctx)
f := &Fs{
name: name,
root: root,
opt: *opt,
ci: ci,
}
f.features = (&fs.Features{
CanHaveEmptyDirectories: true,
}).Fill(ctx, f)
// Make the http connection
isFile, err := f.httpConnection(ctx, opt)
if err != nil {
return nil, err
}
if isFile {
// return an error with an fs which points to the parent
return f, fs.ErrorIsFile
}
if !strings.HasSuffix(f.endpointURL, "/") {
return nil, errors.New("internal error: url doesn't end with /")
}
return f, nil
}
// Name returns the configured name of the file system
func (f *Fs) Name() string {
return f.name
}
// Root returns the root for the filesystem
func (f *Fs) Root() string {
return f.root
}
// String returns the URL for the filesystem
func (f *Fs) String() string {
return f.endpointURL
}
// Features returns the optional features of this Fs
func (f *Fs) Features() *fs.Features {
return f.features
}
// Precision is the remote http file system's modtime precision, which we have no way of knowing. We estimate at 1s
func (f *Fs) Precision() time.Duration {
return time.Second
}
// NewObject creates a new remote http file object
func (f *Fs) NewObject(ctx context.Context, remote string) (fs.Object, error) {
o := &Object{
fs: f,
remote: remote,
}
err := o.head(ctx)
if err != nil {
return nil, err
}
return o, nil
}
// Join's the remote onto the base URL
func (f *Fs) url(remote string) string {
if f.opt.NoEscape {
// Directly concatenate without escaping, no_escape behavior
return f.endpointURL + remote
}
// Default behavior
return f.endpointURL + rest.URLPathEscape(remote)
}
// Errors returned by parseName
var (
errURLJoinFailed = errors.New("URLJoin failed")
errFoundQuestionMark = errors.New("found ? in URL")
errHostMismatch = errors.New("host mismatch")
errSchemeMismatch = errors.New("scheme mismatch")
errNotUnderRoot = errors.New("not under root")
errNameIsEmpty = errors.New("name is empty")
errNameContainsSlash = errors.New("name contains /")
)
// parseName turns a name as found in the page into a remote path or returns an error
func parseName(base *url.URL, name string) (string, error) {
// make URL absolute
u, err := rest.URLJoin(base, name)
if err != nil {
return "", errURLJoinFailed
}
// check it doesn't have URL parameters
uStr := u.String()
if strings.Contains(uStr, "?") {
return "", errFoundQuestionMark
}
// check that this is going back to the same host and scheme
if base.Host != u.Host {
return "", errHostMismatch
}
if base.Scheme != u.Scheme {
return "", errSchemeMismatch
}
// check has path prefix
if !strings.HasPrefix(u.Path, base.Path) {
return "", errNotUnderRoot
}
// calculate the name relative to the base
name = u.Path[len(base.Path):]
2019-02-07 18:41:17 +01:00
// mustn't be empty
if name == "" {
return "", errNameIsEmpty
}
// mustn't contain a / - we are looking for a single level directory
slash := strings.Index(name, "/")
if slash >= 0 && slash != len(name)-1 {
return "", errNameContainsSlash
}
return name, nil
}
// Parse turns HTML for a directory into names
// base should be the base URL to resolve any relative names from
func parse(base *url.URL, in io.Reader) (names []string, err error) {
doc, err := html.Parse(in)
if err != nil {
return nil, err
}
2019-02-08 14:58:22 +01:00
var (
walk func(*html.Node)
seen = make(map[string]struct{})
)
walk = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
for _, a := range n.Attr {
if a.Key == "href" {
name, err := parseName(base, a.Val)
if err == nil {
2019-02-08 14:58:22 +01:00
if _, found := seen[name]; !found {
names = append(names, name)
seen[name] = struct{}{}
}
}
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
walk(c)
}
}
walk(doc)
return names, nil
}
// Adds the configured headers to the request if any
func addHeaders(req *http.Request, opt *Options) {
for i := 0; i < len(opt.Headers); i += 2 {
key := opt.Headers[i]
value := opt.Headers[i+1]
req.Header.Add(key, value)
}
}
// Adds the configured headers to the request if any
func (f *Fs) addHeaders(req *http.Request) {
addHeaders(req, &f.opt)
}
// Read the directory passed in
func (f *Fs) readDir(ctx context.Context, dir string) (names []string, err error) {
URL := f.url(dir)
u, err := url.Parse(URL)
if err != nil {
return nil, fmt.Errorf("failed to readDir: %w", err)
}
if !strings.HasSuffix(URL, "/") {
return nil, fmt.Errorf("internal error: readDir URL %q didn't end in /", URL)
}
// Do the request
req, err := http.NewRequestWithContext(ctx, "GET", URL, nil)
if err != nil {
return nil, fmt.Errorf("readDir failed: %w", err)
}
f.addHeaders(req)
res, err := f.httpClient.Do(req)
2019-02-08 18:40:40 +01:00
if err == nil {
defer fs.CheckClose(res.Body, &err)
if res.StatusCode == http.StatusNotFound {
return nil, fs.ErrorDirNotFound
}
}
err = statusError(res, err)
if err != nil {
return nil, fmt.Errorf("failed to readDir: %w", err)
}
contentType := strings.SplitN(res.Header.Get("Content-Type"), ";", 2)[0]
switch contentType {
case "text/html":
names, err = parse(u, res.Body)
if err != nil {
return nil, fmt.Errorf("readDir: %w", err)
}
default:
return nil, fmt.Errorf("can't parse content type %q", contentType)
}
return names, nil
}
// List the objects and directories in dir into entries. The
// entries can be returned in any order but should be for a
// complete directory.
//
// dir should be "" to list the root, and should not have
// trailing slashes.
//
// This should return ErrDirNotFound if the directory isn't
// found.
func (f *Fs) List(ctx context.Context, dir string) (entries fs.DirEntries, err error) {
if !strings.HasSuffix(dir, "/") && dir != "" {
dir += "/"
}
names, err := f.readDir(ctx, dir)
if err != nil {
return nil, fmt.Errorf("error listing %q: %w", dir, err)
}
var (
entriesMu sync.Mutex // to protect entries
wg sync.WaitGroup
checkers = f.ci.Checkers
in = make(chan string, checkers)
)
add := func(entry fs.DirEntry) {
entriesMu.Lock()
entries = append(entries, entry)
entriesMu.Unlock()
}
for i := 0; i < checkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for remote := range in {
file := &Object{
fs: f,
remote: remote,
}
switch err := file.head(ctx); err {
case nil:
add(file)
case fs.ErrorNotAFile:
// ...found a directory not a file
add(fs.NewDir(remote, time.Time{}))
default:
fs.Debugf(remote, "skipping because of error: %v", err)
}
}
}()
}
for _, name := range names {
isDir := name[len(name)-1] == '/'
name = strings.TrimRight(name, "/")
remote := path.Join(dir, name)
if isDir {
add(fs.NewDir(remote, time.Time{}))
} else {
in <- remote
}
}
close(in)
wg.Wait()
return entries, nil
}
// Put in to the remote path with the modTime given of the given size
//
// May create the object even if it returns an error - if so
// will return the object and the error, otherwise will return
// nil and the error
func (f *Fs) Put(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// PutStream uploads to the remote path with the modTime given of indeterminate size
func (f *Fs) PutStream(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) (fs.Object, error) {
return nil, errorReadOnly
}
// Fs is the filesystem this remote http file object is located within
func (o *Object) Fs() fs.Info {
return o.fs
}
// String returns the URL to the remote HTTP file
func (o *Object) String() string {
if o == nil {
return "<nil>"
}
return o.remote
}
// Remote the name of the remote HTTP file, relative to the fs root
func (o *Object) Remote() string {
return o.remote
}
// Hash returns "" since HTTP (in Go or OpenSSH) doesn't support remote calculation of hashes
func (o *Object) Hash(ctx context.Context, r hash.Type) (string, error) {
return "", hash.ErrUnsupported
}
// Size returns the size in bytes of the remote http file
func (o *Object) Size() int64 {
return o.size
}
// ModTime returns the modification time of the remote http file
func (o *Object) ModTime(ctx context.Context) time.Time {
return o.modTime
}
// url returns the native url of the object
func (o *Object) url() string {
return o.fs.url(o.remote)
}
// head sends a HEAD request to update info fields in the Object
func (o *Object) head(ctx context.Context) error {
if o.fs.opt.NoHead {
o.size = -1
o.modTime = timeUnset
o.contentType = fs.MimeType(ctx, o)
return nil
}
url := o.url()
req, err := http.NewRequestWithContext(ctx, "HEAD", url, nil)
if err != nil {
return fmt.Errorf("stat failed: %w", err)
}
o.fs.addHeaders(req)
res, err := o.fs.httpClient.Do(req)
if err == nil && res.StatusCode == http.StatusNotFound {
return fs.ErrorObjectNotFound
}
err = statusError(res, err)
if err != nil {
return fmt.Errorf("failed to stat: %w", err)
}
return o.decodeMetadata(ctx, res)
}
// decodeMetadata updates info fields in the Object according to HTTP response headers
func (o *Object) decodeMetadata(ctx context.Context, res *http.Response) error {
t, err := http.ParseTime(res.Header.Get("Last-Modified"))
if err != nil {
t = timeUnset
}
o.modTime = t
o.contentType = res.Header.Get("Content-Type")
o.size = rest.ParseSizeFromHeaders(res.Header)
// If NoSlash is set then check ContentType to see if it is a directory
if o.fs.opt.NoSlash {
mediaType, _, err := mime.ParseMediaType(o.contentType)
if err != nil {
return fmt.Errorf("failed to parse Content-Type: %q: %w", o.contentType, err)
}
if mediaType == "text/html" {
return fs.ErrorNotAFile
}
}
return nil
}
// SetModTime sets the modification and access time to the specified time
//
// it also updates the info field
func (o *Object) SetModTime(ctx context.Context, modTime time.Time) error {
return errorReadOnly
}
// Storable returns whether the remote http file is a regular file (not a directory, symbolic link, block device, character device, named pipe, etc.)
func (o *Object) Storable() bool {
return true
}
// Open a remote http file object for reading. Seek is supported
func (o *Object) Open(ctx context.Context, options ...fs.OpenOption) (in io.ReadCloser, err error) {
url := o.url()
req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
if err != nil {
return nil, fmt.Errorf("Open failed: %w", err)
}
// Add optional headers
for k, v := range fs.OpenOptionHeaders(options) {
req.Header.Add(k, v)
}
o.fs.addHeaders(req)
// Do the request
res, err := o.fs.httpClient.Do(req)
err = statusError(res, err)
if err != nil {
return nil, fmt.Errorf("Open failed: %w", err)
}
if err = o.decodeMetadata(ctx, res); err != nil {
return nil, fmt.Errorf("decodeMetadata failed: %w", err)
}
return res.Body, nil
}
// Hashes returns hash.HashNone to indicate remote hashing is unavailable
func (f *Fs) Hashes() hash.Set {
return hash.Set(hash.None)
}
// Mkdir makes the root directory of the Fs object
func (f *Fs) Mkdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// Remove a remote http file object
func (o *Object) Remove(ctx context.Context) error {
return errorReadOnly
}
// Rmdir removes the root directory of the Fs object
func (f *Fs) Rmdir(ctx context.Context, dir string) error {
return errorReadOnly
}
// Update in to the object with the modTime given of the given size
func (o *Object) Update(ctx context.Context, in io.Reader, src fs.ObjectInfo, options ...fs.OpenOption) error {
return errorReadOnly
}
// MimeType of an Object if known, "" otherwise
func (o *Object) MimeType(ctx context.Context) string {
return o.contentType
}
var commandHelp = []fs.CommandHelp{{
Name: "set",
Short: "Set command for updating the config parameters.",
Long: `This set command can be used to update the config parameters
for a running http backend.
Usage Examples:
rclone backend set remote: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=remote: [-o opt_name=opt_value] [-o opt_name2=opt_value2]
rclone rc backend/command command=set fs=remote: -o url=https://example.com
The option keys are named as they are in the config file.
This rebuilds the connection to the http backend when it is called with
the new parameters. Only new parameters need be passed as the values
will default to those currently in use.
It doesn't return anything.
`,
}}
// Command the backend to run a named command
//
// The command run is name
// args may be used to read arguments from
// opts may be used to read optional arguments from
//
// The result should be capable of being JSON encoded
// If it is a string or a []string it will be shown to the user
// otherwise it will be JSON encoded and shown to the user like that
func (f *Fs) Command(ctx context.Context, name string, arg []string, opt map[string]string) (out interface{}, err error) {
switch name {
case "set":
newOpt := f.opt
err := configstruct.Set(configmap.Simple(opt), &newOpt)
if err != nil {
return nil, fmt.Errorf("reading config: %w", err)
}
_, err = f.httpConnection(ctx, &newOpt)
if err != nil {
return nil, fmt.Errorf("updating session: %w", err)
}
f.opt = newOpt
keys := []string{}
for k := range opt {
keys = append(keys, k)
}
fs.Logf(f, "Updated config values: %s", strings.Join(keys, ", "))
return nil, nil
default:
return nil, fs.ErrorCommandNotFound
}
}
// Check the interfaces are satisfied
var (
_ fs.Fs = &Fs{}
_ fs.PutStreamer = &Fs{}
_ fs.Object = &Object{}
_ fs.MimeTyper = &Object{}
_ fs.Commander = &Fs{}
)