// api/archives.js
//
// Provenance: introduced by commit d63f8916670d7c8889aa547225007f829fd83e66
// (Alicia Sykes, Sat, 12 Aug 2023 21:57:24 +0100),
// "Adds API endpoint to fetch data from Wayback Machine".

const axios = require('axios');
const middleware = require('./_common/middleware');

/** Number of milliseconds in one day, used to turn Date differences into days. */
const MS_PER_DAY = 1000 * 60 * 60 * 24;

/**
 * Converts a 14-character `YYYYMMDDhhmmss` timestamp string into a Date.
 *
 * Wayback capture timestamps are expressed in UTC, so the Date is built with
 * `Date.UTC` rather than in the host's local time zone (which would shift the
 * instant by the server's UTC offset).
 *
 * @param {string} timestamp - Timestamp in `YYYYMMDDhhmmss` form.
 * @returns {Date} The corresponding instant; an Invalid Date if any field
 *   fails to parse.
 */
const convertTimestampToDate = (timestamp) => {
  const [year, month, day, hour, minute, second] = [
    timestamp.slice(0, 4),
    timestamp.slice(4, 6),
    timestamp.slice(6, 8),
    timestamp.slice(8, 10),
    timestamp.slice(10, 12),
    timestamp.slice(12, 14),
  ].map((part) => Number.parseInt(part, 10));

  // Date months are 0-indexed, the timestamp's month field is 1-indexed.
  return new Date(Date.UTC(year, month - 1, day, hour, minute, second));
};

/**
 * Counts how many times the digest (column index 2) differs from the digest
 * of the preceding row.
 *
 * The first row only establishes the baseline and is not counted as a change.
 *
 * @param {Array<Array<*>>} results - Scan rows, digest at index 2.
 * @returns {number} Number of digest transitions; never negative.
 */
const countPageChanges = (results) => {
  let previousDigest = null;
  // Start at -1 so that the first row (baseline) brings the count to 0.
  let changes = -1;
  for (const row of results) {
    const digest = row[2];
    if (digest !== previousDigest) {
      previousDigest = digest;
      changes += 1;
    }
  }
  // An empty list (or rows whose digest is always null) would otherwise leak -1.
  return Math.max(changes, 0);
};

/**
 * Computes the mean of the size field (column index 3) across scan rows,
 * rounded to the nearest integer.
 *
 * Rows whose size does not parse as a number are skipped so that a single
 * bad row cannot turn the whole average into NaN.
 *
 * @param {Array<Array<*>>} scans - Scan rows, size at index 3.
 * @returns {number} Rounded average size, or 0 when no row has a numeric size.
 */
const getAveragePageSize = (scans) => {
  const sizes = scans
    .map((scan) => Number.parseInt(scan[3], 10))
    .filter((size) => Number.isFinite(size));
  if (sizes.length === 0) {
    return 0;
  }
  const totalSize = sizes.reduce((sum, size) => sum + size, 0);
  return Math.round(totalSize / sizes.length);
};

/**
 * Derives scan/change frequency figures from the first and last scan dates.
 *
 * Per-day rates are computed against the elapsed number of days between the
 * first and last scan. Any ratio that is undefined (zero elapsed time, zero
 * scans or zero changes) is reported as 0 instead of NaN/Infinity, which
 * would otherwise serialise to `null` in JSON.
 *
 * @param {Date} firstScan - Date of the earliest scan.
 * @param {Date} lastScan - Date of the latest scan.
 * @param {number} totalScans - Total number of scans.
 * @param {number} changeCount - Number of detected content changes.
 * @returns {{daysBetweenScans: number, daysBetweenChanges: number,
 *   scansPerDay: number, changesPerDay: number}} Values rounded to two
 *   decimal places.
 */
const getScanFrequency = (firstScan, lastScan, totalScans, changeCount) => {
  const formatToTwoDecimal = (num) => Number.parseFloat(num.toFixed(2));
  const safeRatio = (numerator, denominator) => {
    const quotient = numerator / denominator;
    return Number.isFinite(quotient) ? formatToTwoDecimal(quotient) : 0;
  };

  const elapsedDays = (lastScan - firstScan) / MS_PER_DAY;

  return {
    daysBetweenScans: safeRatio(elapsedDays, totalScans),
    daysBetweenChanges: safeRatio(elapsedDays, changeCount),
    // N scans span N - 1 intervals over the elapsed period.
    scansPerDay: safeRatio(totalScans - 1, elapsedDays),
    changesPerDay: safeRatio(changeCount, elapsedDays),
  };
};

/**
 * Fetches the capture history of a URL from the Wayback Machine CDX endpoint
 * and summarises it.
 *
 * @param {string} url - The address to look up.
 *   NOTE(review): assumed to arrive un-encoded; confirm against the
 *   middleware so the value is not percent-encoded twice.
 * @returns {Promise<Object>} One of:
 *   - `{ skipped }` when the response has no rows beyond the header,
 *   - `{ error }` when the request or processing throws,
 *   - otherwise the summary (`firstScan`, `lastScan`, `totalScans`,
 *     `changeCount`, `averagePageSize`, `scanFrequency`, `scans`, `scanUrl`).
 */
const getWaybackData = async (url) => {
  // Encode the target so characters such as `&`, `#` or `?` inside it cannot
  // terminate or extend the CDX query string.
  const cdxUrl = `https://web.archive.org/cdx/search/cdx?url=${encodeURIComponent(url)}&output=json&fl=timestamp,statuscode,digest,length,offset`;

  try {
    const { data } = await axios.get(cdxUrl);

    // Check there's data (the first row is the header, so one row means none)
    if (!Array.isArray(data) || data.length <= 1) {
      return { skipped: 'Site has never before been archived via the Wayback Machine' };
    }

    // Drop the header row without mutating the response payload
    const scans = data.slice(1);

    // Process and return the results
    const firstScan = convertTimestampToDate(scans[0][0]);
    const lastScan = convertTimestampToDate(scans[scans.length - 1][0]);
    const totalScans = scans.length;
    const changeCount = countPageChanges(scans);
    return {
      firstScan,
      lastScan,
      totalScans,
      changeCount,
      averagePageSize: getAveragePageSize(scans),
      scanFrequency: getScanFrequency(firstScan, lastScan, totalScans, changeCount),
      scans,
      scanUrl: url,
    };
  } catch (err) {
    return { error: `Error fetching Wayback data: ${err.message}` };
  }
};

exports.handler = middleware(getWaybackData);