Mirror of https://github.com/jzillmann/pdf-to-markdown.git
Synced 2024-11-22 15:53:34 +01:00 at commit 55ae236928
("fix tests - still run header detection based on heights even if TOC headlines have been identified")
TypeScript test file, 256 lines, 8.7 KiB.
import { toMatchFile } from 'jest-file-snapshot';

import * as pdfjs from 'pdfjs-dist/es5/build/pdf';

import * as fs from 'fs';

import * as path from 'path';

import * as http from 'http';

import * as https from 'https';

import PdfParser from 'src/PdfParser';

import PdfPipeline from 'src/PdfPipeline';

import { transformers } from 'src/index';

import Debugger from 'src/Debugger';

import Globals from 'src/Globals';

import Item from 'src/Item';

import { Change } from 'src/debug/ChangeIndex';

import StageResult from 'src/debug/StageResult';

import EvaluationIndex from 'src/debug/EvaluationIndex';

import RemoveRepetitiveItems from 'src/transformer/RemoveRepetitiveItems';

import DetectToc, { TOC_GLOBAL } from 'src/transformer/DetectToc';

import DetectHeaders from 'src/transformer/DetectHeaders';

import TOC from 'src/TOC';

import { getText } from 'src/support/items';

// Point pdf.js at its prebuilt ES5 worker so it can run inside the Jest/node environment.
pdfjs.GlobalWorkerOptions.workerSrc = `pdfjs-dist/es5/build/pdf.worker.min.js`;

// One shared parser/pipeline instance reused by every suite in this file.
const parser = new PdfParser(pdfjs);

const pipeline = new PdfPipeline(parser, transformers);

// Local fixtures: every *.pdf under ./examples gets a full per-stage snapshot suite.
const folder = './examples';

const files = fs.readdirSync(folder).filter((file) => file.endsWith('.pdf'));

// Remote PDFs exercised by the selective URL suite further below.
const urls = [
  'https://homepages.cwi.nl/~lex/files/dict.pdf',
  'https://github.com/mozilla/pdf.js/raw/master/web/compressed.tracemonkey-pldi-09.pdf',
];

// Downloaded URL PDFs are cached here between test runs.
const downloadCache = 'node_modules/.cache/files';

// Register the file-snapshot matcher used by both suites.
expect.extend({ toMatchFile });
// Test is for debugging purpose
|
|
test('Debug', async () => {
|
|
// const data = fs.readFileSync(`${folder}/Adventures-Of-Sherlock-Holmes.pdf`);
|
|
const data = fs.readFileSync(`${folder}/ExamplePdf.pdf`);
|
|
await pipeline.parse(data, () => {});
|
|
});
|
|
|
|
// For every example PDF, run the full pipeline once and snapshot each
// transformer stage's output against the files committed under ./examples.
describe.each(files)('Test %p', (file) => {
  const data = fs.readFileSync(`${folder}/${file}`);

  let debug: Debugger;
  // Globals that already appeared in an earlier stage's header; each global is
  // only printed the first time it shows up (see toHeader/globalsToString).
  const printedGlobals = new Set<string>();
  beforeAll(async () => {
    debug = await pipeline.parse(data, () => {}).then((pc) => pc.debug());
  });

  test.each(transformers.map((t) => t.name).filter((name) => name !== 'Does nothing'))(
    'stage %p',
    (transformerName) => {
      const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
      // Output lines are collected in chunks; each chunk becomes its own snapshot file.
      const chunkedLines: string[][] = [[]];
      let resultIndex = 0;
      let collectedItems = 0;
      stageResult.selectPages(true, true).forEach((page) => {
        page.itemGroups.forEach((itemGroup) => {
          const item = itemGroup.top;
          const change = stageResult.changes.change(item);
          // Only changed items are printed, unless the stage asks to show everything.
          if (change || stageResult.descriptor.debug?.showAll) {
            chunkedLines[resultIndex].push(itemToString(debug.fontMap, item, change, stageResult.evaluations));
            collectedItems++;
          }
        });

        // we split results to multiple files to circumvent githubs 100MB limit
        // (chunk boundary is only re-evaluated between pages, so a page is never split)
        resultIndex = Math.floor(collectedItems / 450_000);
        if (resultIndex === chunkedLines.length) {
          chunkedLines.push([]);
        }
      });

      // Global characteristics
      chunkedLines[0].unshift(toHeader(stageResult, printedGlobals));
      try {
        chunkedLines.forEach((lines, idx) => {
          const transformerResultAsString = lines.join('\n') || '{}';
          expect(transformerResultAsString).toMatchFile(matchFilePath(file, transformerName, chunkedLines.length, idx));
        });
      } finally {
        // Record the printed globals even when the snapshot assertion fails, so
        // later stages of the same file still skip them.
        stageResult.globals.keys().forEach((globalKey) => {
          printedGlobals.add(globalKey);
        });
      }
    },
  );
});
function matchFilePath(pdfFileName: string, transformerName: string, chunkCount = 1, chunkIndex = 0): string {
|
|
const pdfFileNameWithoutExtension = pdfFileName.substr(0, pdfFileName.length - 4);
|
|
const resultFileName = `${transformerName[0].toLowerCase() + transformerName.slice(1).replace(/\s/g, '')}`;
|
|
const fileIndex = chunkCount > 1 ? `.${chunkIndex}` : '';
|
|
return `${folder}/${pdfFileNameWithoutExtension}/${resultFileName}${fileIndex}.json`;
|
|
}
|
|
|
|
// Downloads a couple of real-world PDFs and snapshots only selected transformer
// stages — cheaper than running the full per-stage matrix on remote files.
describe('Selective transforms on URL PDFs', () => {
  const transformerNames = [new RemoveRepetitiveItems().name, new DetectToc().name, new DetectHeaders().name];
  test.each(urls)('URL %p', async (url) => {
    const { fileName, data } = download(url);
    const debug = await pipeline.parse(data, () => {}).then((pc) => pc.debug());
    // Per-URL bookkeeping: globals printed by one stage are skipped by the next.
    const printedGlobals = new Set<string>();

    transformerNames.forEach((transformerName) => {
      const stageResult = debug.stageResult(debug.stageNames.indexOf(transformerName));
      const pages = stageResult.selectPages(true, true);

      const lines: string[] = [];
      lines.push(toHeader(stageResult, printedGlobals));

      // Unlike the file suite above, only changed items are printed (no showAll).
      pages.forEach((page) =>
        page.itemGroups.forEach((itemGroup) => {
          const item = itemGroup.top;
          const change = stageResult.changes.change(item);
          if (change) {
            lines.push(itemToString(debug.fontMap, item, change, stageResult.evaluations));
          }
        }),
      );

      const transformerResultAsString = lines.join('\n') || '{}';
      expect(transformerResultAsString).toMatchFile(matchFilePath(fileName, transformerName));

      stageResult.globals.keys().forEach((globalKey) => {
        printedGlobals.add(globalKey);
      });
    });
  });
});
function toHeader(stageResult: StageResult, alreadyPrintedGlobals: Set<string>): string {
|
|
const groupedItemCount = stageResult
|
|
.selectPages(false, true)
|
|
.reduce((itemCount, page) => itemCount + page.itemGroups.length, 0);
|
|
|
|
return JSON.stringify(
|
|
{
|
|
pages: stageResult.pages.length,
|
|
items: stageResult.itemsUnpacked().length,
|
|
groupedItems: groupedItemCount,
|
|
changes: stageResult.changes.changeCount(),
|
|
schema: stageResult.schema,
|
|
globals: globalsToString(stageResult.globals, alreadyPrintedGlobals),
|
|
// messages: stageResults.messages,
|
|
},
|
|
null,
|
|
2,
|
|
);
|
|
}
|
|
|
|
function globalsToString(globals: Globals, alreadyPrintedGlobals: Set<string>): object {
|
|
return Array.from(globals.map)
|
|
.filter(([key]) => !alreadyPrintedGlobals.has(key))
|
|
.reduce((obj, [key, value]) => {
|
|
if (key === TOC_GLOBAL.key) {
|
|
const toc: TOC = value as TOC;
|
|
value = {
|
|
...toc,
|
|
tocHeadlineItems: toc.tocHeadlineItems.map((item) => ({
|
|
page: item.page,
|
|
text: getText(item),
|
|
})),
|
|
};
|
|
}
|
|
obj[key] = value;
|
|
return obj;
|
|
}, {});
|
|
}
|
|
|
|
function itemToString(
|
|
fontMap: Map<string, object>,
|
|
item: Item,
|
|
change: Change | undefined,
|
|
evaluationIndex: EvaluationIndex,
|
|
): string {
|
|
const fontName: string | Array<string> = item.data['fontName'];
|
|
let newFontName: string | Array<string> | undefined = undefined;
|
|
if (fontName) {
|
|
if (typeof fontName === 'string') {
|
|
newFontName = fontMap.get(fontName)?.['name'];
|
|
} else {
|
|
newFontName = fontName.map((name) => fontMap.get(name)?.['name']);
|
|
}
|
|
}
|
|
const changeType = change?.constructor.name || 'none';
|
|
const transform: undefined | number[] = item.data['transform'];
|
|
let newTransform: undefined | string[];
|
|
if (transform) {
|
|
newTransform = transform.map((num) => num.toFixed(2));
|
|
}
|
|
|
|
const object = {
|
|
page: item.page,
|
|
change: changeType,
|
|
score: evaluationIndex.evaluationScore(item),
|
|
types: item.data['types'],
|
|
...item.data,
|
|
fontName: newFontName,
|
|
height: item.data['height'].toFixed(2),
|
|
width: item.data['width'].toFixed(2),
|
|
transform: newTransform,
|
|
};
|
|
if (!evaluationIndex.hasScores()) {
|
|
delete object.score;
|
|
}
|
|
return JSON.stringify(object);
|
|
}
|
|
|
|
// Returns the given URL's PDF from the local download cache, fetching it
// first when it is not cached yet.
function download(url: string): { fileName: string; data: Buffer } {
  const fileName = path.basename(new URL(url).pathname);
  const localFilePath = `${downloadCache}/${fileName}`;
  if (!fs.existsSync(localFilePath)) {
    fs.mkdirSync(downloadCache, { recursive: true });
    // NOTE(review): downloadToFile returns a Promise that is not awaited, so
    // the readFileSync below can race a cold cache (file missing or partially
    // written on the first run). Awaiting it would require making this
    // function — and its caller — async; confirm before changing the contract.
    downloadToFile(url, localFilePath);
  }
  return {
    fileName,
    data: fs.readFileSync(localFilePath),
  };
}
function downloadToFile(url: string, dest: string): Promise<void> {
|
|
const uri = new URL(url);
|
|
const pkg = url.toLowerCase().startsWith('https:') ? https : http;
|
|
|
|
return new Promise((resolve, reject) => {
|
|
pkg.get(uri.href).on('response', (res) => {
|
|
if (res.statusCode === 200) {
|
|
const file = fs.createWriteStream(dest, { flags: 'wx' });
|
|
res
|
|
.on('end', () => {
|
|
file.end();
|
|
resolve();
|
|
})
|
|
.on('error', (err) => {
|
|
file.destroy();
|
|
fs.unlink(dest, () => reject(err));
|
|
})
|
|
.pipe(file);
|
|
} else if (res.statusCode === 302 || res.statusCode === 301) {
|
|
// Recursively follow redirects, only a 200 will resolve.
|
|
if (res.headers.location) {
|
|
downloadToFile(res.headers.location, dest).then(() => resolve());
|
|
}
|
|
} else {
|
|
reject(new Error(`Download request failed, response status: ${res.statusCode} ${res.statusMessage}`));
|
|
}
|
|
});
|
|
});
|
|
}
|