Check robots.txt and use a user agent (#17)

Gervasio Marchand authored 2022-12-20 19:15:13 -08:00; committed by GitHub
parent d535512a6b
commit 23be734a5e
2 changed files with 31 additions and 9 deletions

Project file:

@@ -11,6 +11,7 @@
   <ItemGroup>
     <PackageReference Include="Npgsql" Version="7.0.1" />
+    <PackageReference Include="TurnerSoftware.RobotsExclusionTools" Version="0.9.1" />
   </ItemGroup>
 </Project>
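
The package added above supplies the two types this commit leans on: RobotsFileParser, which downloads and parses a robots.txt file, and RobotsFile, which answers per-URL access questions. A minimal standalone sketch of that flow, using example.social as a placeholder host:

    using TurnerSoftware.RobotsExclusionTools;

    var parser = new RobotsFileParser();
    // Download and parse a (placeholder) instance's robots.txt
    RobotsFile robotsFile = await parser.FromUriAsync(new Uri("https://example.social/robots.txt"));

    // Ask whether our user agent may fetch a specific URL
    var url = new Uri("https://example.social/tags/dotnet.json");
    bool allowed = robotsFile.IsAllowedAccess(url, "GetMoarFediverse");
    Console.WriteLine(allowed ? "allowed" : "disallowed by robots.txt");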

Program source:

@@ -1,6 +1,7 @@
 using System.Collections.Concurrent;
 using System.Text.Json;
 using GetMoarFediverse;
+using TurnerSoftware.RobotsExclusionTools;
 
 var configPath = Environment.GetEnvironmentVariable("CONFIG_PATH");
 if (args.Length == 1){
@@ -20,6 +21,8 @@ if (Config.Instance == null)
 }
 
 var client = new HttpClient();
+client.DefaultRequestHeaders.Add("User-Agent", "GetMoarFediverse");
+
 var authClient = new HttpClient
 {
     BaseAddress = new Uri(Config.Instance.FakeRelayUrl)
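
The User-Agent set here does double duty: it identifies the bot to instance operators, and it is the same token the robots.txt check below matches rules against, so the two must stay in sync. For reference, a sketch of the strongly typed alternative to the raw header string; the "1.0" version and the comment URL are invented placeholders:

    using System.Net.Http.Headers;

    var client = new HttpClient();
    // Same effect as the raw "User-Agent" header above, built from typed parts
    client.DefaultRequestHeaders.UserAgent.Add(
        new ProductInfoHeaderValue("GetMoarFediverse", "1.0"));        // placeholder version
    client.DefaultRequestHeaders.UserAgent.Add(
        new ProductInfoHeaderValue("(+https://example.com/contact)")); // placeholder comment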
@@ -32,9 +35,17 @@ if (!File.Exists(importedPath))
     File.WriteAllText(importedPath, "");
 }
 
-var importedList = File.ReadAllLines(importedPath).ToList();
-var imported = importedList.ToHashSet();
-var statusesToLoadBag = new ConcurrentBag<string>();
+ParallelOptions parallelOptions = new()
+{
+    MaxDegreeOfParallelism = 8
+};
+
+var robotsFileParser = new RobotsFileParser();
+var sitesRobotFile = new ConcurrentDictionary<string, RobotsFile>();
+await Parallel.ForEachAsync(Config.Instance.Sites, parallelOptions, async (site, _) =>
+{
+    sitesRobotFile[site.Host] = await robotsFileParser.FromUriAsync(new Uri($"http://{site.Host}/robots.txt"));
+});
 
 List<(string host, string tag)> sitesTags;
 if (Config.Instance.MastodonPostgresConnectionString.HasValue())
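
This hunk downloads each configured site's robots.txt once, up front and in parallel, so the per-tag loop never re-fetches it. Two details worth noting: the file is requested over http:// (most instances redirect to HTTPS), and a host whose fetch throws will fault the whole Parallel.ForEachAsync call. A sketch of a more defensive variant; the try/catch is an illustration, not part of the commit:

    await Parallel.ForEachAsync(Config.Instance.Sites, parallelOptions, async (site, _) =>
    {
        try
        {
            sitesRobotFile[site.Host] =
                await robotsFileParser.FromUriAsync(new Uri($"http://{site.Host}/robots.txt"));
        }
        catch (Exception e)
        {
            // Leave the host out of the dictionary; the TryGetValue check in the
            // tag loop below then treats it as having no robots.txt restrictions.
            Console.WriteLine($"Could not fetch robots.txt for {site.Host}: {e.Message}");
        }
    });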
@@ -53,19 +64,29 @@ else
         .ToList();
 }
 
-ParallelOptions parallelOptions = new()
-{
-    MaxDegreeOfParallelism = 8
-};
+var importedList = File.ReadAllLines(importedPath).ToList();
+var imported = importedList.ToHashSet();
+var statusesToLoadBag = new ConcurrentBag<string>();
 
 await Parallel.ForEachAsync(sitesTags, parallelOptions, async (st, _) =>
 {
     var (site, tag) = st;
     Console.WriteLine($"Fetching tag #{tag} from {site}");
+    var url = $"https://{site}/tags/{tag}.json";
+    if (sitesRobotFile.TryGetValue(site, out var robotsFile))
+    {
+        var allowed = robotsFile.IsAllowedAccess(new Uri(url), "GetMoarFediverse");
+        if (!allowed)
+        {
+            Console.WriteLine($"Scraping {url} is not allowed based on their robots.txt file");
+            return;
+        }
+    }
+
     HttpResponseMessage? response = null;
     try
     {
-        response = await client.GetAsync($"https://{site}/tags/{tag}.json");
+        response = await client.GetAsync(url);
         response.EnsureSuccessStatusCode();
     }
     catch (Exception e)
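
The gate's semantics: if a host's robots.txt was never captured, TryGetValue misses and the fetch proceeds; if it was captured and it disallows the GetMoarFediverse agent for this URL, the tag is skipped before any request goes out. Extracted into a hypothetical helper (not code from the commit), the check reduces to:

    using System.Collections.Generic;
    using TurnerSoftware.RobotsExclusionTools;

    // Hypothetical refactor of the inline check: a host with no cached robots.txt
    // is treated as unrestricted, matching the TryGetValue fall-through above.
    static bool MayFetch(IReadOnlyDictionary<string, RobotsFile> robotsFiles, string host, string url) =>
        !robotsFiles.TryGetValue(host, out var robotsFile)
        || robotsFile.IsAllowedAccess(new Uri(url), "GetMoarFediverse");

    // Usage inside the tag loop (ConcurrentDictionary implements IReadOnlyDictionary):
    // if (!MayFetch(sitesRobotFile, site, url)) return;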