pdf-to-markdown/test/PdfParser.test.ts
2024-03-26 10:52:54 -06:00

190 lines
6.2 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
import * as fs from 'fs';
import PdfParser from 'src/PdfParser';
import ParseProgressReporter from 'src/ParseProgressReporter';
import Progress from 'src/Progress';
import Item from 'src/Item';
const parser = new PdfParser(pdfjs);
test('basic example PDF parse', async () => {
const progressUpdates: Progress[] = [];
const data = fs.readFileSync('./examples/ExamplePdf.pdf', null);
// to test
const result = await parser.parse(
data,
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
);
// verify pages
const expectedPages = 7;
expect(result.metadata.title()).toEqual('ExamplePdf');
expect(result.metadata.author()).toEqual('Johannes Zillmann');
expect(result.pageCount).toBe(expectedPages);
result.pdfjsPages.forEach((pdfPage, i) => {
expect(pdfPage._pageIndex).toBe(i);
});
expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
// verify first n items
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
new Item(0, {
str: 'Mega Überschrift',
dir: 'ltr',
width: 245.05800000000005,
height: 30,
transform: [30, 0, 0, 30, 175, 756],
fontName: 'g_d0_f1',
}).withoutUuid(),
new Item(0, {
str: '2te Überschrift',
dir: 'ltr',
width: 130.056,
height: 20,
transform: [20, 0, 0, 20, 233, 665],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: 'Dies ist eine Test-PDF',
dir: 'ltr',
width: 108.61950000000003,
height: 11,
transform: [11, 0, 0, 11, 240, 585],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: '.',
dir: 'ltr',
width: 3.0580000000000003,
height: 11,
transform: [11, 0, 0, 11, 352.6927, 585],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: '1',
dir: 'ltr',
width: 4.077333704,
height: 7.333334,
transform: [7.333334, 0, 0, 7.333334, 348, 588],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: 'Fürs Testen des ',
dir: 'ltr',
width: 83.7826,
height: 11,
transform: [11, 0, 0, 11, 208, 572],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: 'Markdown Parsers',
dir: 'ltr',
width: 91.6982,
height: 11,
transform: [11, 0, 0, 11, 291.77832, 572],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: '.',
dir: 'ltr',
width: 3.0580000000000003,
height: 11,
transform: [11, 0, 0, 11, 383.47360000000003, 572],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: ' ',
dir: 'ltr',
width: 3.0580000000000003,
height: 11,
transform: [11, 0, 0, 11, 61.078451, 59],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: 'In Deutsch.',
dir: 'ltr',
width: 55.64240000000001,
height: 11,
transform: [11, 0, 0, 11, 64.134603, 59],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: '1',
dir: 'ltr',
width: 4.077333704,
height: 7.333334,
transform: [7.333334, 0, 0, 7.333334, 57, 62],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(0, {
str: '\x00',
dir: 'ltr',
width: 0,
height: 12,
transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f3',
}).withoutUuid(),
new Item(0, {
str: '1',
dir: 'ltr',
width: 6.672000000000001,
height: 12,
transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(1, {
str: '\x00',
dir: 'ltr',
width: 0,
height: 12,
transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f3',
}).withoutUuid(),
new Item(1, {
str: '2',
dir: 'ltr',
width: 6.672000000000001,
height: 12,
transform: [12, 0, 0, 12, 294, 45],
fontName: 'g_d0_f2',
}).withoutUuid(),
new Item(2, {
str: 'Paragraphen',
dir: 'ltr',
width: 110.04479999999998,
height: 18,
transform: [18, 0, 0, 18, 57, 767],
fontName: 'g_d0_f1',
}).withoutUuid(),
]);
// verify progress
expect(progressUpdates.length).toBe(expectedPages + 3);
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
expect(progressUpdates[1].stageProgress).toEqual([1, 1, 0, 0]);
expect(progressUpdates[1].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
expect(progressUpdates[2].stageProgress).toEqual([1, 1, 1 / expectedPages, 0]);
expect(progressUpdates[2].stageDetails).toEqual([null, null, `1 / ${expectedPages}`, null]);
expect(progressUpdates[3].stageProgress).toEqual([1, 1, 2 / expectedPages, 0]);
expect(progressUpdates[3].stageDetails).toEqual([null, null, `2 / ${expectedPages}`, null]);
expect(progressUpdates[4].stageProgress).toEqual([1, 1, 3 / expectedPages, 0]);
expect(progressUpdates[4].stageDetails).toEqual([null, null, `3 / ${expectedPages}`, null]);
expect(progressUpdates[5].stageProgress).toEqual([1, 1, 4 / expectedPages, 0]);
expect(progressUpdates[5].stageDetails).toEqual([null, null, `4 / ${expectedPages}`, null]);
expect(progressUpdates[6].stageProgress).toEqual([1, 1, 5 / expectedPages, 0]);
expect(progressUpdates[6].stageDetails).toEqual([null, null, `5 / ${expectedPages}`, null]);
expect(progressUpdates[7].stageProgress).toEqual([1, 1, 6 / expectedPages, 0]);
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]);
expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
});