2020-12-20 19:01:03 +01:00
|
|
|
|
import * as pdfjs from 'pdfjs-dist/es5/build/pdf';
|
|
|
|
|
import * as fs from 'fs';
|
2021-03-13 08:46:22 +01:00
|
|
|
|
|
|
|
|
|
import PdfParser from 'src/PdfParser';
|
2021-01-12 22:54:22 +01:00
|
|
|
|
import ParseProgressReporter from 'src/ParseProgressReporter';
|
|
|
|
|
import Progress from 'src/Progress';
|
2021-01-28 23:06:37 +01:00
|
|
|
|
import Item from 'src/Item';
|
2020-12-20 19:01:03 +01:00
|
|
|
|
|
|
|
|
|
// Parser instance exercised by the test below; it is constructed with the pdf.js ES5 build imported at the top of this file.
const parser = new PdfParser(pdfjs);
|
|
|
|
|
|
2021-01-12 22:54:22 +01:00
|
|
|
|
/**
 * Parses `../examples/ExamplePdf.pdf` with the shared `parser` and checks, in order:
 * the document metadata and page count, the per-page pdf.js objects, the first parsed
 * items, and the sequence of progress updates handed to the reporter callback.
 */
test('basic example PDF parse', async () => {
  const expectedPages = 7;
  const pdfBytes = fs.readFileSync('../examples/ExamplePdf.pdf', null);

  // Each reported progress object is stored as a JSON round-trip copy rather than by reference.
  // NOTE(review): presumably the reporter reuses/mutates one Progress instance — confirm in ParseProgressReporter.
  const progressUpdates: Progress[] = [];
  const reporter = new ParseProgressReporter((progress) =>
    progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress),
  );

  // --- exercise ---
  const result = await parser.parse(pdfBytes, reporter);

  // --- metadata & pages ---
  const { metadata, pdfjsPages } = result;
  expect(metadata.title()).toEqual('ExamplePdf');
  expect(metadata.author()).toEqual('Johannes Zillmann');
  expect(result.pageCount).toBe(expectedPages);
  pdfjsPages.forEach((page, position) => {
    expect(page._pageIndex).toBe(position);
  });
  const firstPage = pdfjsPages[0];
  expect(firstPage._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]);
  expect(firstPage.getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]);

  // --- leading items ---
  // Expected and parsed items are both compared via `withoutUuid()`, so the uuid never takes part in the comparison.
  const expectedItem = (page: number, data: ConstructorParameters<typeof Item>[1]) =>
    new Item(page, data).withoutUuid();

  const expectedItems = [
    expectedItem(0, {
      str: 'Mega Überschrift',
      dir: 'ltr',
      width: 245.05800000000005,
      height: 30,
      transform: [30, 0, 0, 30, 175, 756],
      fontName: 'g_d0_f1',
    }),
    expectedItem(0, {
      str: '2te Überschrift',
      dir: 'ltr',
      width: 130.056,
      height: 20,
      transform: [20, 0, 0, 20, 233, 665],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: 'Dies ist eine Test-PDF',
      dir: 'ltr',
      width: 108.61950000000003,
      height: 11,
      transform: [11, 0, 0, 11, 240, 585],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: '.',
      dir: 'ltr',
      width: 3.0580000000000003,
      height: 11,
      transform: [11, 0, 0, 11, 352.6927, 585],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: '1',
      dir: 'ltr',
      width: 4.077333704,
      height: 7.333334,
      transform: [7.333334, 0, 0, 7.333334, 348, 588],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: 'Für’s Testen des ',
      dir: 'ltr',
      width: 83.7826,
      height: 11,
      transform: [11, 0, 0, 11, 208, 572],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: 'Markdown Parsers',
      dir: 'ltr',
      width: 91.6982,
      height: 11,
      transform: [11, 0, 0, 11, 291.77832, 572],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: '.',
      dir: 'ltr',
      width: 3.0580000000000003,
      height: 11,
      transform: [11, 0, 0, 11, 383.47360000000003, 572],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: ' ',
      dir: 'ltr',
      width: 3.0580000000000003,
      height: 11,
      transform: [11, 0, 0, 11, 61.078451, 59],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: 'In Deutsch.',
      dir: 'ltr',
      width: 55.64240000000001,
      height: 11,
      transform: [11, 0, 0, 11, 64.134603, 59],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: '1',
      dir: 'ltr',
      width: 4.077333704,
      height: 7.333334,
      transform: [7.333334, 0, 0, 7.333334, 57, 62],
      fontName: 'g_d0_f2',
    }),
    expectedItem(0, {
      str: '\x00',
      dir: 'ltr',
      width: 0,
      height: 12,
      transform: [12, 0, 0, 12, 294, 45],
      fontName: 'g_d0_f3',
    }),
    expectedItem(0, {
      str: '1',
      dir: 'ltr',
      width: 6.672000000000001,
      height: 12,
      transform: [12, 0, 0, 12, 294, 45],
      fontName: 'g_d0_f2',
    }),
    expectedItem(1, {
      str: '\x00',
      dir: 'ltr',
      width: 0,
      height: 12,
      transform: [12, 0, 0, 12, 294, 45],
      fontName: 'g_d0_f3',
    }),
    expectedItem(1, {
      str: '2',
      dir: 'ltr',
      width: 6.672000000000001,
      height: 12,
      transform: [12, 0, 0, 12, 294, 45],
      fontName: 'g_d0_f2',
    }),
    expectedItem(2, {
      str: 'Paragraphen',
      dir: 'ltr',
      width: 110.04479999999998,
      height: 18,
      transform: [18, 0, 0, 18, 57, 767],
      fontName: 'g_d0_f1',
    }),
  ];
  const leadingItems = result.items.slice(0, expectedItems.length).map((item) => item.withoutUuid());
  expect(leadingItems).toEqual(expectedItems);

  // --- progress ---
  // Expected sequence: one update after the header, one after the metadata,
  // one per page, and a closing update with every stage complete.
  expect(progressUpdates.length).toBe(expectedPages + 3);
  progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 'Metadata', 'Pages', 'Fonts']));

  const [afterHeader, afterMetadata] = progressUpdates;
  expect(afterHeader.stageProgress).toEqual([1, 0, 0, 0]);
  expect(afterHeader.stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);

  expect(afterMetadata.stageProgress).toEqual([1, 1, 0, 0]);
  expect(afterMetadata.stageDetails).toEqual([null, null, `0 / ${expectedPages}`, null]);

  // The update for page n sits at index n + 1, right after the header and metadata updates.
  for (let pageNumber = 1; pageNumber <= expectedPages; pageNumber++) {
    const pageUpdate = progressUpdates[pageNumber + 1];
    expect(pageUpdate.stageProgress).toEqual([1, 1, pageNumber / expectedPages, 0]);
    expect(pageUpdate.stageDetails).toEqual([null, null, `${pageNumber} / ${expectedPages}`, null]);
  }

  const finalUpdate = progressUpdates[expectedPages + 2];
  expect(finalUpdate.stageProgress).toEqual([1, 1, 1, 1]);
  expect(finalUpdate.stageDetails).toEqual([null, null, `${expectedPages} / ${expectedPages}`, null]);
});
|