mirror of
https://github.com/g3rv4/GetMoarFediverse.git
synced 2024-11-21 23:23:09 +01:00
Check robots.txt and use a user agent (#17)
This commit is contained in:
parent
d535512a6b
commit
23be734a5e
@ -11,6 +11,7 @@
|
|||||||
|
|
||||||
<ItemGroup>
|
<ItemGroup>
|
||||||
<PackageReference Include="Npgsql" Version="7.0.1" />
|
<PackageReference Include="Npgsql" Version="7.0.1" />
|
||||||
|
<PackageReference Include="TurnerSoftware.RobotsExclusionTools" Version="0.9.1" />
|
||||||
</ItemGroup>
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
using System.Collections.Concurrent;
|
using System.Collections.Concurrent;
|
||||||
using System.Text.Json;
|
using System.Text.Json;
|
||||||
using GetMoarFediverse;
|
using GetMoarFediverse;
|
||||||
|
using TurnerSoftware.RobotsExclusionTools;
|
||||||
|
|
||||||
var configPath = Environment.GetEnvironmentVariable("CONFIG_PATH");
|
var configPath = Environment.GetEnvironmentVariable("CONFIG_PATH");
|
||||||
if (args.Length == 1){
|
if (args.Length == 1){
|
||||||
@ -20,6 +21,8 @@ if (Config.Instance == null)
|
|||||||
}
|
}
|
||||||
|
|
||||||
var client = new HttpClient();
|
var client = new HttpClient();
|
||||||
|
client.DefaultRequestHeaders.Add("User-Agent", "GetMoarFediverse");
|
||||||
|
|
||||||
var authClient = new HttpClient
|
var authClient = new HttpClient
|
||||||
{
|
{
|
||||||
BaseAddress = new Uri(Config.Instance.FakeRelayUrl)
|
BaseAddress = new Uri(Config.Instance.FakeRelayUrl)
|
||||||
@ -32,9 +35,17 @@ if (!File.Exists(importedPath))
|
|||||||
File.WriteAllText(importedPath, "");
|
File.WriteAllText(importedPath, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
var importedList = File.ReadAllLines(importedPath).ToList();
|
ParallelOptions parallelOptions = new()
|
||||||
var imported = importedList.ToHashSet();
|
{
|
||||||
var statusesToLoadBag = new ConcurrentBag<string>();
|
MaxDegreeOfParallelism = 8
|
||||||
|
};
|
||||||
|
|
||||||
|
var robotsFileParser = new RobotsFileParser();
|
||||||
|
var sitesRobotFile = new ConcurrentDictionary<string, RobotsFile>();
|
||||||
|
await Parallel.ForEachAsync(Config.Instance.Sites, parallelOptions, async (site, _) =>
|
||||||
|
{
|
||||||
|
sitesRobotFile[site.Host] = await robotsFileParser.FromUriAsync(new Uri($"http://{site.Host}/robots.txt"));
|
||||||
|
});
|
||||||
|
|
||||||
List<(string host, string tag)> sitesTags;
|
List<(string host, string tag)> sitesTags;
|
||||||
if (Config.Instance.MastodonPostgresConnectionString.HasValue())
|
if (Config.Instance.MastodonPostgresConnectionString.HasValue())
|
||||||
@ -53,19 +64,29 @@ else
|
|||||||
.ToList();
|
.ToList();
|
||||||
}
|
}
|
||||||
|
|
||||||
ParallelOptions parallelOptions = new()
|
var importedList = File.ReadAllLines(importedPath).ToList();
|
||||||
{
|
var imported = importedList.ToHashSet();
|
||||||
MaxDegreeOfParallelism = 8
|
var statusesToLoadBag = new ConcurrentBag<string>();
|
||||||
};
|
|
||||||
|
|
||||||
await Parallel.ForEachAsync(sitesTags, parallelOptions, async (st, _) =>
|
await Parallel.ForEachAsync(sitesTags, parallelOptions, async (st, _) =>
|
||||||
{
|
{
|
||||||
var (site, tag) = st;
|
var (site, tag) = st;
|
||||||
Console.WriteLine($"Fetching tag #{tag} from {site}");
|
Console.WriteLine($"Fetching tag #{tag} from {site}");
|
||||||
|
|
||||||
|
var url = $"https://{site}/tags/{tag}.json";
|
||||||
|
if (sitesRobotFile.TryGetValue(site, out var robotsFile))
|
||||||
|
{
|
||||||
|
var allowed = robotsFile.IsAllowedAccess(new Uri(url), "GetMoarFediverse");
|
||||||
|
if (!allowed)
|
||||||
|
{
|
||||||
|
Console.WriteLine($"Scraping {url} is not allowed based on their robots.txt file");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
HttpResponseMessage? response = null;
|
HttpResponseMessage? response = null;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
response = await client.GetAsync($"https://{site}/tags/{tag}.json");
|
response = await client.GetAsync(url);
|
||||||
response.EnsureSuccessStatusCode();
|
response.EnsureSuccessStatusCode();
|
||||||
}
|
}
|
||||||
catch (Exception e)
|
catch (Exception e)
|
||||||
|
Loading…
Reference in New Issue
Block a user