Find, parse and render info from Sitemap

This commit is contained in:
Alicia Sykes 2023-07-21 20:53:54 +01:00
parent fc030ffcd6
commit 7ebe96b9be
7 changed files with 138 additions and 47 deletions

View File

@ -124,6 +124,11 @@
to = "/.netlify/functions/tech-stack"
status = 301
force = true
[[redirects]]
from = "/sitemap"
to = "/.netlify/functions/sitemap"
status = 301
force = true
# For router history mode, ensure pages land on index
[[redirects]]

View File

@ -33,7 +33,8 @@
"tsparticles": "^2.0.6",
"typescript": "^4.7.3",
"wappalyzer": "^6.10.63",
"web-vitals": "^2.1.4"
"web-vitals": "^2.1.4",
"xml2js": "^0.6.0"
},
"scripts": {
"start": "react-scripts start",

41
server/lambda/sitemap.js Normal file
View File

@ -0,0 +1,41 @@
const axios = require('axios');
const xml2js = require('xml2js');
exports.handler = async (event) => {
const baseUrl = event.queryStringParameters.url.replace(/^(?:https?:\/\/)?/i, "");
const url = baseUrl.startsWith('http') ? baseUrl : `http://${baseUrl}`;
let sitemapUrl;
try {
// Fetch robots.txt
const robotsRes = await axios.get(`${url}/robots.txt`);
const robotsTxt = robotsRes.data.split('\n');
for (let line of robotsTxt) {
if (line.startsWith('Sitemap:')) {
sitemapUrl = line.split(' ')[1];
}
}
if (!sitemapUrl) {
return {
statusCode: 404,
body: JSON.stringify({ error: 'Sitemap not found in robots.txt' }),
};
}
// Fetch sitemap
const sitemapRes = await axios.get(sitemapUrl);
const sitemap = await xml2js.parseStringPromise(sitemapRes.data);
return {
statusCode: 200,
body: JSON.stringify(sitemap),
};
} catch (error) {
return {
statusCode: 500,
body: JSON.stringify({ error: error.message }),
};
}
};

View File

@ -0,0 +1,58 @@
import { Card } from 'components/Form/Card';
import Heading from 'components/Form/Heading';
import Row, { ExpandableRow } from 'components/Form/Row';
import colors from 'styles/colors';
// CSS handed to the Card through its `styles` prop: caps the card height and
// scrolls vertically past it, colours links with the primary colour, and
// renders `small` text as a dimmed block with a top margin.
const cardStyles = `
max-height: 50rem;
overflow-y: auto;
a {
color: ${colors.primary};
}
small {
margin-top: 1rem;
opacity: 0.5;
display: block;
a { color: ${colors.primary}; }
}
`;
/**
 * Card listing the pages found in a site's sitemap.
 *
 * `props.data` is the sitemap XML converted to an object, where every child
 * element is an array (hence the `[0]` lookups). Two shapes are handled:
 *  - a regular sitemap (`url` or `urlset.url`): one expandable row per page,
 *    labelled with the page path and holding last-modified, change frequency
 *    and priority when present
 *  - a sitemap index (`sitemapindex.sitemap`): a list of links to sub-sitemaps
 *
 * Entries come from a third-party file, so malformed or missing `<loc>` values
 * are tolerated rather than allowed to break rendering.
 *
 * @param props.data - Parsed sitemap object.
 * @param props.title - Heading shown on the card.
 * @param props.actionButtons - Passed through to the Card.
 */
const SitemapCard = (props: {data: any, title: string, actionButtons: any }): JSX.Element => {
  const normalSiteMap = props.data.url || props.data.urlset?.url || null;
  const siteMapIndex = props.data.sitemapindex?.sitemap || null;

  // Collects the optional per-page metadata shown when a row is expanded.
  const makeExpandableRowData = (site: any) => {
    const results = [];
    if (site.lastmod) { results.push({lbl: 'Last Modified', val: site.lastmod[0]}); }
    if (site.changefreq) { results.push({lbl: 'Change Frequency', val: site.changefreq[0]}); }
    if (site.priority) { results.push({lbl: 'Priority', val: site.priority[0]}); }
    return results;
  };

  // `new URL` throws on relative or malformed input; report that as null.
  const parseUrl = (url: string): URL | null => {
    try {
      return new URL(url);
    } catch (e) {
      return null;
    }
  };

  // Path part of the URL, or the raw value when it cannot be parsed.
  const getPathFromUrl = (url: string) => parseUrl(url)?.pathname || url;

  // Only http(s) locations become links, so a `javascript:` URL in a
  // sitemap cannot end up in an href.
  const isWebLink = (url: string) => {
    const protocol = parseUrl(url)?.protocol;
    return protocol === 'http:' || protocol === 'https:';
  };

  // First <loc> value of an entry, or null when absent or not plain text.
  const getLoc = (entry: any): string | null => {
    const loc = entry?.loc?.[0];
    return typeof loc === 'string' ? loc : null;
  };

  return (
    <Card heading={props.title} actionButtons={props.actionButtons} styles={cardStyles}>
      {
        normalSiteMap && normalSiteMap.map((subpage: any, index: number) => {
          const loc = getLoc(subpage);
          if (!loc) return null;
          return (
            <ExpandableRow
              key={`page-${index}`}
              lbl={getPathFromUrl(loc)}
              val=""
              rowList={makeExpandableRowData(subpage)}
            ></ExpandableRow>
          );
        })
      }
      { siteMapIndex && <p>
        This site returns a sitemap index, which is a list of sitemaps.
      </p>}
      {
        siteMapIndex && siteMapIndex.map((subpage: any, index: number) => {
          const loc = getLoc(subpage);
          if (!loc) return null;
          return (
            <Row key={`sitemap-${index}`} lbl="" val="">
              {isWebLink(loc)
                ? <a href={loc}>{getPathFromUrl(loc)}</a>
                : <span>{loc}</span>}
            </Row>
          );
        })
      }
    </Card>
  );
}
export default SitemapCard;

View File

@ -190,7 +190,7 @@ const jobNames = [
'domain-lookup',
'tech-stack',
'hosts',
'lighthouse',
'quality',
'cookies',
'server-info',
'redirects',
@ -200,8 +200,9 @@ const jobNames = [
'ports',
'screenshot',
'txt-records',
'sitemap',
'hsts',
'whois',
// 'whois',
'features',
'carbon',
'trace-route',

View File

@ -35,6 +35,7 @@ import CarbonFootprintCard from 'components/Results/CarbonFootprint';
import SiteFeaturesCard from 'components/Results/SiteFeatures';
import DnsSecCard from 'components/Results/DnsSec';
import HstsCard from 'components/Results/Hsts';
import SitemapCard from 'components/Results/Sitemap';
import DomainLookup from 'components/Results/DomainLookup';
import DnsServerCard from 'components/Results/DnsServer';
import TechStackCard from 'components/Results/TechStack';
@ -60,24 +61,11 @@ import {
const ResultsOuter = styled.div`
display: flex;
flex-direction: column;
.my-masonry-grid {
display: -webkit-box; /* Not needed if autoprefixing */
display: -ms-flexbox; /* Not needed if autoprefixing */
.masonry-grid {
display: flex;
// margin: 1rem;
// margin-left: -30px; /* gutter size offset */
width: auto;
}
.my-masonry-grid_column {
// margin-left: 30px; /* gutter size */
background-clip: padding-box;
}
/* Style your items */
.my-masonry-grid_column > div { /* change div to reference your elements you put in <Masonry> */
// background: grey;
// margin-bottom: 30px;
}
.masonry-grid-col section { margin: 1rem 0.5rem; }
`;
const ResultsContent = styled.section`
@ -165,7 +153,6 @@ const Results = (): JSX.Element => {
}, []);
const parseJson = (response: Response): Promise<any> => {
// return response.json()
return new Promise((resolve) => {
if (response.ok) {
response.json()
@ -181,20 +168,6 @@ const Results = (): JSX.Element => {
}
});
};
// const parseJson = (response: Response): Promise<any> => {
// if (response.status >= 400) {
// return new Promise((resolve) => resolve({ error: `Failed to fetch data: ${response.statusText}` }));
// }
// return new Promise((resolve) => {
// if (!response) { resolve({ error: 'No response from server' }); }
// response.json()
// .catch(error => resolve({ error: `Failed to process response, likely due to Netlify's 10-sec limit on lambda functions. Error: ${error}`}));
// });
// };
useEffect(() => {
if (!addressType || addressType === 'empt') {
@ -312,16 +285,6 @@ const Results = (): JSX.Element => {
.then(res => applyWhoIsResults(res)),
});
// Fetch and parse built-with results
// const [technologyResults, updateTechnologyResults] = useMotherHook<TechnologyGroup[]>({
// jobId: 'built-with',
// updateLoadingJobs,
// addressInfo: { address, addressType, expectedAddressTypes: urlTypeOnly },
// fetchRequest: () => fetch(`https://api.builtwith.com/v21/api.json?KEY=${keys.builtWith}&LOOKUP=${address}`)
// .then(res => parseJson(res))
// .then(res => makeTechnologies(res)),
// });
// Fetches DNS TXT records
const [txtRecordResults, updateTxtRecordResults] = useMotherHook({
jobId: 'txt-records',
@ -378,6 +341,14 @@ const Results = (): JSX.Element => {
fetchRequest: () => fetch(`/check-hsts?url=${address}`).then(res => parseJson(res)),
});
// Get a website's listed pages, from its sitemap
const [sitemapResults, updateSitemapResults] = useMotherHook({
jobId: 'sitemap',
updateLoadingJobs,
addressInfo: { address, addressType, expectedAddressTypes: urlTypeOnly },
fetchRequest: () => fetch(`/sitemap?url=${address}`).then(res => parseJson(res)),
});
// Get site features from BuiltWith
const [siteFeaturesResults, updateSiteFeaturesResults] = useMotherHook({
jobId: 'features',
@ -449,12 +420,13 @@ const Results = (): JSX.Element => {
{ id: 'dns', title: 'DNS Records', result: dnsResults, Component: DnsRecordsCard, refresh: updateDnsResults },
{ id: 'hosts', title: 'Host Names', result: shoadnResults?.hostnames, Component: HostNamesCard, refresh: updateShodanResults },
{ id: 'tech-stack', title: 'Tech Stack', result: techStackResults, Component: TechStackCard, refresh: updateTechStackResults },
{ id: 'lighthouse', title: 'Performance', result: lighthouseResults, Component: LighthouseCard, refresh: updateLighthouseResults },
{ id: 'quality', title: 'Quality Summary', result: lighthouseResults, Component: LighthouseCard, refresh: updateLighthouseResults },
{ id: 'cookies', title: 'Cookies', result: cookieResults, Component: CookiesCard, refresh: updateCookieResults },
{ id: 'trace-route', title: 'Trace Route', result: traceRouteResults, Component: TraceRouteCard, refresh: updateTraceRouteResults },
{ id: 'server-info', title: 'Server Info', result: shoadnResults?.serverInfo, Component: ServerInfoCard, refresh: updateShodanResults },
{ id: 'redirects', title: 'Redirects', result: redirectResults, Component: RedirectsCard, refresh: updateRedirectResults },
{ id: 'robots-txt', title: 'Crawl Rules', result: robotsTxtResults, Component: RobotsTxtCard, refresh: updateRobotsTxtResults },
{ id: 'sitemap', title: 'Pages', result: sitemapResults, Component: SitemapCard, refresh: updateSitemapResults },
{ id: 'dnssec', title: 'DNSSEC', result: dnsSecResults, Component: DnsSecCard, refresh: updateDnsSecResults },
{ id: 'status', title: 'Server Status', result: serverStatusResults, Component: ServerStatusCard, refresh: updateServerStatusResults },
{ id: 'ports', title: 'Open Ports', result: portsResults, Component: OpenPortsCard, refresh: updatePortsResults },
@ -526,8 +498,8 @@ const Results = (): JSX.Element => {
<Masonry
breakpointCols={{ 10000: 12, 4000: 9, 3600: 8, 3200: 7, 2800: 6, 2400: 5, 2000: 4, 1600: 3, 1200: 2, 800: 1 }}
className="my-masonry-grid"
columnClassName="my-masonry-grid_column">
className="masonry-grid"
columnClassName="masonry-grid-col">
{
resultCardData.map(({ id, title, result, refresh, Component }, index: number) => (
(result && !result.error) ? (

View File

@ -8699,7 +8699,7 @@ sass-loader@^12.3.0:
klona "^2.0.4"
neo-async "^2.6.2"
sax@~1.2.4:
sax@>=0.6.0, sax@~1.2.4:
version "1.2.4"
resolved "https://registry.yarnpkg.com/sax/-/sax-1.2.4.tgz#2816234e2378bddc4e5354fab5caa895df7100d9"
integrity sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==
@ -10552,6 +10552,19 @@ xml-name-validator@^3.0.0:
resolved "https://registry.yarnpkg.com/xml-name-validator/-/xml-name-validator-3.0.0.tgz#6ae73e06de4d8c6e47f9fb181f78d648ad457c6a"
integrity sha512-A5CUptxDsvxKJEU3yO6DuWBSJz/qizqzJKOMIfUJHETbBw/sFaDxgd6fxm1ewUaM0jZ444Fc5vC5ROYurg/4Pw==
xml2js@^0.6.0:
version "0.6.0"
resolved "https://registry.yarnpkg.com/xml2js/-/xml2js-0.6.0.tgz#07afc447a97d2bd6507a1f76eeadddb09f7a8282"
integrity sha512-eLTh0kA8uHceqesPqSE+VvO1CDDJWMwlQfB6LuN6T8w6MaDJ8Txm8P7s5cHD0miF0V+GGTZrDQfxPZQVsur33w==
dependencies:
sax ">=0.6.0"
xmlbuilder "~11.0.0"
xmlbuilder@~11.0.0:
version "11.0.1"
resolved "https://registry.yarnpkg.com/xmlbuilder/-/xmlbuilder-11.0.1.tgz#be9bae1c8a046e76b31127726347d0ad7002beb3"
integrity sha512-fDlsI/kFEx7gLvbecc0/ohLG50fugQp8ryHzMTuW9vSa1GJ0XYWKnhsUx7oie3G98+r56aTQIUB4kht42R3JvA==
xmlchars@^2.2.0:
version "2.2.0"
resolved "https://registry.yarnpkg.com/xmlchars/-/xmlchars-2.2.0.tgz#060fe1bcb7f9c76fe2a17db86a9bc3ab894210cb"