mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-08 14:49:20 +01:00
189 lines
6.2 KiB
TypeScript
189 lines
6.2 KiB
TypeScript
import PdfParser from 'src/PdfParser';
|
||
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
||
import * as fs from 'fs';
|
||
import ParseProgressReporter from 'src/ParseProgressReporter';
|
||
import Progress from 'src/Progress';
|
||
import Item from 'src/Item';
|
||
|
||
const parser = new PdfParser(pdfjs);
|
||
|
||
test('basic example PDF parse', async () => {
|
||
const progressUpdates: Progress[] = [];
|
||
const data = fs.readFileSync('../examples/ExamplePdf.pdf', null);
|
||
|
||
// to test
|
||
const result = await parser.parse(
|
||
data,
|
||
new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)),
|
||
);
|
||
|
||
// verify pages
|
||
const expectedPages = 7;
|
||
expect(result.metadata.title()).toEqual('ExamplePdf');
|
||
expect(result.metadata.author()).toEqual('Johannes Zillmann');
|
||
expect(result.pageCount()).toBe(expectedPages);
|
||
result.pdfjsPages.forEach((pdfPage, i) => {
|
||
expect(pdfPage._pageIndex).toBe(i);
|
||
});
|
||
expect(result.pdfjsPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
|
||
expect(result.pdfjsPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);
|
||
|
||
// verify first n items
|
||
expect(result.items.slice(0, 16).map((item) => item.withoutUuid())).toEqual([
|
||
new Item(0, {
|
||
str: 'Mega Überschrift',
|
||
dir: 'ltr',
|
||
width: 245.05800000000005,
|
||
height: 30,
|
||
transform: [30, 0, 0, 30, 175, 756],
|
||
fontName: 'g_d0_f1',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '2te Überschrift',
|
||
dir: 'ltr',
|
||
width: 130.056,
|
||
height: 20,
|
||
transform: [20, 0, 0, 20, 233, 665],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: 'Dies ist eine Test-PDF',
|
||
dir: 'ltr',
|
||
width: 108.61950000000003,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 240, 585],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '.',
|
||
dir: 'ltr',
|
||
width: 3.0580000000000003,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 352.6927, 585],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '1',
|
||
dir: 'ltr',
|
||
width: 4.077333704,
|
||
height: 7.333334,
|
||
transform: [7.333334, 0, 0, 7.333334, 348, 588],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: 'Für’s Testen des ',
|
||
dir: 'ltr',
|
||
width: 83.7826,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 208, 572],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: 'Markdown Parsers',
|
||
dir: 'ltr',
|
||
width: 91.6982,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 291.77832, 572],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '.',
|
||
dir: 'ltr',
|
||
width: 3.0580000000000003,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 383.47360000000003, 572],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: ' ',
|
||
dir: 'ltr',
|
||
width: 3.0580000000000003,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 61.078451, 59],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: 'In Deutsch.',
|
||
dir: 'ltr',
|
||
width: 55.64240000000001,
|
||
height: 11,
|
||
transform: [11, 0, 0, 11, 64.134603, 59],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '1',
|
||
dir: 'ltr',
|
||
width: 4.077333704,
|
||
height: 7.333334,
|
||
transform: [7.333334, 0, 0, 7.333334, 57, 62],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '\x00',
|
||
dir: 'ltr',
|
||
width: 0,
|
||
height: 12,
|
||
transform: [12, 0, 0, 12, 294, 45],
|
||
fontName: 'g_d0_f3',
|
||
}).withoutUuid(),
|
||
new Item(0, {
|
||
str: '1',
|
||
dir: 'ltr',
|
||
width: 6.672000000000001,
|
||
height: 12,
|
||
transform: [12, 0, 0, 12, 294, 45],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(1, {
|
||
str: '\x00',
|
||
dir: 'ltr',
|
||
width: 0,
|
||
height: 12,
|
||
transform: [12, 0, 0, 12, 294, 45],
|
||
fontName: 'g_d0_f3',
|
||
}).withoutUuid(),
|
||
new Item(1, {
|
||
str: '2',
|
||
dir: 'ltr',
|
||
width: 6.672000000000001,
|
||
height: 12,
|
||
transform: [12, 0, 0, 12, 294, 45],
|
||
fontName: 'g_d0_f2',
|
||
}).withoutUuid(),
|
||
new Item(2, {
|
||
str: 'Paragraphen',
|
||
dir: 'ltr',
|
||
width: 110.04479999999998,
|
||
height: 18,
|
||
transform: [18, 0, 0, 18, 57, 767],
|
||
fontName: 'g_d0_f1',
|
||
}).withoutUuid(),
|
||
]);
|
||
|
||
// verify progress
|
||
expect(progressUpdates.length).toBe(expectedPages + 3);
|
||
progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));
|
||
expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]);
|
||
expect(progressUpdates[0].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
|
||
|
||
expect(progressUpdates[1].stageProgress).toEqual([1, 1, 0, 0]);
|
||
expect(progressUpdates[1].stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);
|
||
|
||
expect(progressUpdates[2].stageProgress).toEqual([1, 1, 1 / expectedPages, 0]);
|
||
expect(progressUpdates[2].stageDetails).toEqual([null, null, `1 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[3].stageProgress).toEqual([1, 1, 2 / expectedPages, 0]);
|
||
expect(progressUpdates[3].stageDetails).toEqual([null, null, `2 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[4].stageProgress).toEqual([1, 1, 3 / expectedPages, 0]);
|
||
expect(progressUpdates[4].stageDetails).toEqual([null, null, `3 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[5].stageProgress).toEqual([1, 1, 4 / expectedPages, 0]);
|
||
expect(progressUpdates[5].stageDetails).toEqual([null, null, `4 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[6].stageProgress).toEqual([1, 1, 5 / expectedPages, 0]);
|
||
expect(progressUpdates[6].stageDetails).toEqual([null, null, `5 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[7].stageProgress).toEqual([1, 1, 6 / expectedPages, 0]);
|
||
expect(progressUpdates[7].stageDetails).toEqual([null, null, `6 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[8].stageProgress).toEqual([1, 1, 7 / expectedPages, 0]);
|
||
expect(progressUpdates[8].stageDetails).toEqual([null, null, `7 / ${expectedPages}`, null]);
|
||
expect(progressUpdates[9].stageProgress).toEqual([1, 1, 1, 1]);
|
||
expect(progressUpdates[9].stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
|
||
});
|