From 23be734a5e57f178554dc6be28fe2740d8981909 Mon Sep 17 00:00:00 2001
From: Gervasio Marchand
Date: Tue, 20 Dec 2022 19:15:13 -0800
Subject: [PATCH] Check robots.txt and use a user agent (#17)

---
 src/GetMoarFediverse.csproj |  1 +
 src/Program.cs              | 39 ++++++++++++++++++++++++++++---------
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/src/GetMoarFediverse.csproj b/src/GetMoarFediverse.csproj
index eaf9beb..3ee31cf 100644
--- a/src/GetMoarFediverse.csproj
+++ b/src/GetMoarFediverse.csproj
@@ -11,6 +11,7 @@
+    <PackageReference Include="TurnerSoftware.RobotsExclusionTools" Version="..." />

diff --git a/src/Program.cs b/src/Program.cs
index deb4187..d556a98 100644
--- a/src/Program.cs
+++ b/src/Program.cs
@@ -1,6 +1,7 @@
 using System.Collections.Concurrent;
 using System.Text.Json;
 using GetMoarFediverse;
+using TurnerSoftware.RobotsExclusionTools;
 
 var configPath = Environment.GetEnvironmentVariable("CONFIG_PATH");
 if (args.Length == 1){
@@ -20,6 +21,8 @@ if (Config.Instance == null)
 }
 
 var client = new HttpClient();
+client.DefaultRequestHeaders.Add("User-Agent", "GetMoarFediverse");
+
 var authClient = new HttpClient
 {
     BaseAddress = new Uri(Config.Instance.FakeRelayUrl)
@@ -32,9 +35,17 @@ if (!File.Exists(importedPath))
     File.WriteAllText(importedPath, "");
 }
 
-var importedList = File.ReadAllLines(importedPath).ToList();
-var imported = importedList.ToHashSet();
-var statusesToLoadBag = new ConcurrentBag<string>();
+ParallelOptions parallelOptions = new()
+{
+    MaxDegreeOfParallelism = 8
+};
+
+var robotsFileParser = new RobotsFileParser();
+var sitesRobotFile = new ConcurrentDictionary<string, RobotsFile>();
+await Parallel.ForEachAsync(Config.Instance.Sites, parallelOptions, async (site, _) =>
+{
+    sitesRobotFile[site.Host] = await robotsFileParser.FromUriAsync(new Uri($"http://{site.Host}/robots.txt"));
+});
 
 List<(string host, string tag)> sitesTags;
 if (Config.Instance.MastodonPostgresConnectionString.HasValue())
@@ -53,19 +64,29 @@ else
         .ToList();
 }
 
-ParallelOptions parallelOptions = new()
-{
-    MaxDegreeOfParallelism = 8
-};
-
+var importedList = File.ReadAllLines(importedPath).ToList();
+var imported = importedList.ToHashSet();
+var statusesToLoadBag = new ConcurrentBag<string>();
 await Parallel.ForEachAsync(sitesTags, parallelOptions, async (st, _) =>
 {
     var (site, tag) = st;
     Console.WriteLine($"Fetching tag #{tag} from {site}");
+
+    var url = $"https://{site}/tags/{tag}.json";
+    if (sitesRobotFile.TryGetValue(site, out var robotsFile))
+    {
+        var allowed = robotsFile.IsAllowedAccess(new Uri(url), "GetMoarFediverse");
+        if (!allowed)
+        {
+            Console.WriteLine($"Scraping {url} is not allowed based on their robots.txt file");
+            return;
+        }
+    }
+
     HttpResponseMessage? response = null;
     try
     {
-        response = await client.GetAsync($"https://{site}/tags/{tag}.json");
+        response = await client.GetAsync(url);
         response.EnsureSuccessStatusCode();
     }
     catch (Exception e)
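
Note: the robots.txt gate this patch introduces boils down to the standalone sketch below. It uses only the calls visible in the diff (RobotsFileParser.FromUriAsync, RobotsFile.IsAllowedAccess, and the User-Agent header); the host example.social and the tag dotnet are hypothetical placeholders, not values from the patch.

using TurnerSoftware.RobotsExclusionTools;

// Hypothetical target; the real program builds this per configured
// site and tag as $"https://{site}/tags/{tag}.json".
var target = new Uri("https://example.social/tags/dotnet.json");

// Fetch and parse the site's robots.txt, as the patch does once per host.
var parser = new RobotsFileParser();
var robots = await parser.FromUriAsync(new Uri("https://example.social/robots.txt"));

// Only hit the tag feed if robots.txt allows the "GetMoarFediverse" agent.
if (robots.IsAllowedAccess(target, "GetMoarFediverse"))
{
    var client = new HttpClient();
    client.DefaultRequestHeaders.Add("User-Agent", "GetMoarFediverse");
    var json = await client.GetStringAsync(target);
    Console.WriteLine($"Fetched {json.Length} characters");
}
else
{
    Console.WriteLine($"Scraping {target} is not allowed based on their robots.txt file");
}

Design-wise, the patch fetches each host's robots.txt up front, in parallel, into a ConcurrentDictionary keyed by host, so the per-tag loop only pays for a dictionary lookup; when a host has no entry, the fetch proceeds unchecked.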