Skip to content

Commit

Permalink
perf(core): optimize broken links checker (#9778)
Browse files Browse the repository at this point in the history
  • Loading branch information
slorber authored Jan 24, 2024
1 parent 8597e5d commit c827b6d
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 66 deletions.
57 changes: 57 additions & 0 deletions packages/docusaurus/src/server/__tests__/brokenLinks.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
*/

import {jest} from '@jest/globals';
import reactRouterConfig from 'react-router-config';
import {handleBrokenLinks} from '../brokenLinks';
import type {RouteConfig} from '@docusaurus/types';

Expand Down Expand Up @@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
"
`);
});

it('is performant and minimize calls to matchRoutes', async () => {
const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');

const scale = 100;

const routes: SimpleRoute[] = [
...Array.from<SimpleRoute>({length: scale}).map((_, i) => ({
path: `/page${i}`,
})),
...Array.from<SimpleRoute>({length: scale}).fill({
path: '/pageDynamic/:subpath1',
}),
];

const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
Array.from<SimpleRoute>({length: scale}).map((_, i) => [
`/page${i}`,
{
links: [
...Array.from<SimpleRoute>({length: scale}).flatMap((_2, j) => [
`/page${j}`,
`/page${j}?age=42`,
`/page${j}#anchor${j}`,
`/page${j}?age=42#anchor${j}`,
`/pageDynamic/subPath${j}`,
`/pageDynamic/subPath${j}?age=42`,
// `/pageDynamic/subPath${j}#anchor${j}`,
// `/pageDynamic/subPath${j}?age=42#anchor${j}`,
]),
],
anchors: Array.from<SimpleRoute>({length: scale}).map(
(_2, j) => `anchor${j}`,
),
},
]),
);

// console.time('testBrokenLinks');
await testBrokenLinks({
routes,
collectedLinks,
});
// console.timeEnd('testBrokenLinks');

// Idiomatic code calling matchRoutes multiple times is not performant
// We try to minimize the calls to this expensive function
// Otherwise large sites will have super long execution times
// See https://github.com/facebook/docusaurus/issues/9754
// See https://twitter.com/sebastienlorber/status/1749392773415858587
// We expect no more matchRoutes calls than number of dynamic route links
expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
// We expect matchRoutes to be called with a reduced number of routes
expect(routes).toHaveLength(scale * 2);
expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
});
});
183 changes: 117 additions & 66 deletions packages/docusaurus/src/server/brokenLinks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,18 @@

import _ from 'lodash';
import logger from '@docusaurus/logger';
import {matchRoutes} from 'react-router-config';
import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
import {getAllFinalRoutes} from './utils';
import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';

function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
// @ts-expect-error: React router types RouteConfig with an actual React
// component, but we load route components with string paths.
// We don't actually access component here, so it's fine.
return reactRouterMatchRoutes(routeConfig, pathname);
}

type BrokenLink = {
link: string;
resolvedLink: string;
Expand All @@ -26,88 +33,121 @@ type CollectedLinks = {
[pathname: string]: {links: string[]; anchors: string[]};
};

function getBrokenLinksForPage({
// We use efficient data structures for performance reasons
// See https://github.com/facebook/docusaurus/issues/9754
type CollectedLinksNormalized = Map<
string,
{links: Set<string>; anchors: Set<string>}
>;

type BrokenLinksHelper = {
collectedLinks: CollectedLinksNormalized;
isPathBrokenLink: (linkPath: URLPath) => boolean;
isAnchorBrokenLink: (linkPath: URLPath) => boolean;
};

function createBrokenLinksHelper({
collectedLinks,
pagePath,
pageLinks,
routes,
}: {
collectedLinks: CollectedLinks;
pagePath: string;
pageLinks: string[];
pageAnchors: string[];
collectedLinks: CollectedLinksNormalized;
routes: RouteConfig[];
}): BrokenLink[] {
const allCollectedPaths = new Set(Object.keys(collectedLinks));
}): BrokenLinksHelper {
const validPathnames = new Set(collectedLinks.keys());

// Matching against the route array can be expensive
// If the route is already in the valid pathnames,
// we can avoid matching against it as an optimization
const remainingRoutes = routes.filter(
(route) => !validPathnames.has(route.path),
);

function isPathnameMatchingAnyRoute(pathname: string): boolean {
if (matchRoutes(remainingRoutes, pathname).length > 0) {
// IMPORTANT: this is an optimization here
// See https://github.com/facebook/docusaurus/issues/9754
// Large Docusaurus sites have many routes!
// We try to minimize calls to a possibly expensive matchRoutes function
validPathnames.add(pathname);
return true;
}

return false;
}

function isPathBrokenLink(linkPath: URLPath) {
const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
const matchedRoutes = pathnames
// @ts-expect-error: React router types RouteConfig with an actual React
// component, but we load route components with string paths.
// We don't actually access component here, so it's fine.
.map((l) => matchRoutes(routes, l))
.flat();
// The link path is broken if:
// - it doesn't match any route
// - it doesn't match any collected path
return (
matchedRoutes.length === 0 &&
!pathnames.some((p) => allCollectedPaths.has(p))
);
if (pathnames.some((p) => validPathnames.has(p))) {
return false;
}
if (pathnames.some(isPathnameMatchingAnyRoute)) {
return false;
}
return true;
}

function isAnchorBrokenLink(linkPath: URLPath) {
const {pathname, hash} = linkPath;

// Link has no hash: it can't be a broken anchor link
if (hash === undefined) {
return false;
}

// Link has empty hash ("#", "/page#"...): we do not report it as broken
// Empty hashes are used for various weird reasons, by us and other users...
// See for example: https://github.com/facebook/docusaurus/pull/6003
if (hash === '') {
return false;
}

const targetPage =
collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];

collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
// link with anchor to a page that does not exist (or did not collect any
// link/anchor) is considered as a broken anchor
if (!targetPage) {
return true;
}

// it's a broken anchor if the target page exists
// but the anchor does not exist on that page
const hashes = [hash, decodeURIComponent(hash)];
return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
// it's a not broken anchor if the anchor exists on the target page
if (
targetPage.anchors.has(hash) ||
targetPage.anchors.has(decodeURIComponent(hash))
) {
return false;
}
return true;
}

const brokenLinks = pageLinks.flatMap((link) => {
return {
collectedLinks,
isPathBrokenLink,
isAnchorBrokenLink,
};
}

function getBrokenLinksForPage({
pagePath,
helper,
}: {
pagePath: string;
helper: BrokenLinksHelper;
}): BrokenLink[] {
const pageData = helper.collectedLinks.get(pagePath)!;

const brokenLinks: BrokenLink[] = [];

pageData.links.forEach((link) => {
const linkPath = parseURLPath(link, pagePath);
if (isPathBrokenLink(linkPath)) {
return [
{
link,
resolvedLink: serializeURLPath(linkPath),
anchor: false,
},
];
}
if (isAnchorBrokenLink(linkPath)) {
return [
{
link,
resolvedLink: serializeURLPath(linkPath),
anchor: true,
},
];
if (helper.isPathBrokenLink(linkPath)) {
brokenLinks.push({
link,
resolvedLink: serializeURLPath(linkPath),
anchor: false,
});
} else if (helper.isAnchorBrokenLink(linkPath)) {
brokenLinks.push({
link,
resolvedLink: serializeURLPath(linkPath),
anchor: true,
});
}
return [];
});

return brokenLinks;
Expand All @@ -128,26 +168,30 @@ function getBrokenLinks({
collectedLinks,
routes,
}: {
collectedLinks: CollectedLinks;
collectedLinks: CollectedLinksNormalized;
routes: RouteConfig[];
}): BrokenLinksMap {
const filteredRoutes = filterIntermediateRoutes(routes);

return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
const helper = createBrokenLinksHelper({
collectedLinks,
routes: filteredRoutes,
});

const result: BrokenLinksMap = {};
collectedLinks.forEach((_unused, pagePath) => {
try {
return getBrokenLinksForPage({
collectedLinks,
pageLinks: pageCollectedData.links,
pageAnchors: pageCollectedData.anchors,
result[pagePath] = getBrokenLinksForPage({
pagePath,
routes: filteredRoutes,
helper,
});
} catch (e) {
throw new Error(`Unable to get broken links for page ${pagePath}.`, {
cause: e,
});
}
});
return result;
}

function brokenLinkMessage(brokenLink: BrokenLink): string {
Expand Down Expand Up @@ -303,15 +347,22 @@ function reportBrokenLinks({
// JS users might call "collectLink(undefined)" for example
// TS users might call "collectAnchor('#hash')" with/without #
// We clean/normalize the collected data to avoid obscure errors being thrown
// We also use optimized data structures for a faster algorithm
function normalizeCollectedLinks(
collectedLinks: CollectedLinks,
): CollectedLinks {
return _.mapValues(collectedLinks, (pageCollectedData) => ({
links: pageCollectedData.links.filter(_.isString),
anchors: pageCollectedData.anchors
.filter(_.isString)
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
}));
): CollectedLinksNormalized {
const result: CollectedLinksNormalized = new Map();
Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
result.set(pathname, {
links: new Set(pageCollectedData.links.filter(_.isString)),
anchors: new Set(
pageCollectedData.anchors
.filter(_.isString)
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
),
});
});
return result;
}

export async function handleBrokenLinks({
Expand Down

0 comments on commit c827b6d

Please sign in to comment.