diff --git a/core/src/Item.ts b/core/src/Item.ts new file mode 100644 index 0000000..6ebc104 --- /dev/null +++ b/core/src/Item.ts @@ -0,0 +1,13 @@ +export default class Item { + page: number; + data: object; + + constructor(page: number, data: object) { + this.page = page; + this.data = data; + } + + value(column: string): object { + return this.data[column]; + } +} diff --git a/core/src/ParseResult.ts b/core/src/ParseResult.ts index 2857dbd..014fbff 100644 --- a/core/src/ParseResult.ts +++ b/core/src/ParseResult.ts @@ -1,12 +1,20 @@ +import Item from './Item'; import type Metadata from './Metadata'; -import type ParsedPage from './ParsedPage'; export default class ParseResult { + pdfPages: any[]; metadata: Metadata; - pages: ParsedPage[]; + columns: string[]; + items: Item[]; - constructor(metadata: Metadata, pages: ParsedPage[]) { + constructor(pdfPages: any[], metadata: Metadata, columns: string[], items: Item[]) { + this.pdfPages = pdfPages; this.metadata = metadata; - this.pages = pages; + this.columns = columns; + this.items = items; + } + + pageCount(): number { + return this.pdfPages.length; } } diff --git a/core/src/ParsedPage.ts b/core/src/ParsedPage.ts index a27c1eb..244dd38 100644 --- a/core/src/ParsedPage.ts +++ b/core/src/ParsedPage.ts @@ -2,12 +2,12 @@ import type ParsedPageItem from './ParsedPageItem'; export default class ParsedPage { index: number; - viewPortTransform: number[]; + pdfPage: any; items: ParsedPageItem[]; - constructor(index: number, viewPortTransform: number[], items: ParsedPageItem[]) { + constructor(index: number, pdfPage: any, items: ParsedPageItem[]) { this.index = index; - this.viewPortTransform = viewPortTransform; + this.pdfPage = pdfPage; this.items = items; } } diff --git a/core/src/PdfParser.ts b/core/src/PdfParser.ts index 21f6a79..89fa05d 100644 --- a/core/src/PdfParser.ts +++ b/core/src/PdfParser.ts @@ -1,3 +1,4 @@ +import Item from './Item'; import Metadata from './Metadata'; import ParsedPage from './ParsedPage'; 
import type ParseReporter from './ParseReporter'; @@ -10,6 +11,7 @@ import type TextItem from './TextItem'; */ export default class PdfParser { pdfjs: any; + columns = ['str', 'dir', 'width', 'height', 'transform', 'fontName']; /* text-item property names handed to ParseResult as its columns */ constructor(pdfjs: any) { this.pdfjs = pdfjs; } @@ -43,14 +45,17 @@ export default class PdfParser { this.extractPagesSequentially(pdfDocument, reporter), ]); }) - .then(([metadata, pages]) => new ParseResult(new Metadata(metadata), pages)); + .then(([metadata, pages]) => { + const pdfPages = pages.map((page) => page.pdfPage); + const items = pages.reduce((allItems, page) => allItems.concat(page.items), []); + return new ParseResult(pdfPages, new Metadata(metadata), this.columns, items); + }); } private extractPagesSequentially(pdfDocument: any, reporter: ParseReporter): Promise { return [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { return accumulatorPromise.then((accumulatedResults) => { return pdfDocument.getPage(index + 1).then((page) => { - const viewport = page.getViewport({ scale: 1.0 }); return this.triggerFontRetrieval(page).then(() => page .getTextContent({ @@ -58,8 +63,9 @@ export default class PdfParser { disableCombineTextItems: true, }) .then((textContent) => { + const items = textContent.items.map((rawItem) => new Item(index, rawItem)); reporter.parsedPage(index); - return [...accumulatedResults, new ParsedPage(index, viewport.transform, textContent.items)]; + return [...accumulatedResults, new ParsedPage(index, page, items)]; }), ); }); @@ -119,7 +125,7 @@ export default class PdfParser { // console.log('Parsed result:', r.length); // console.log('Parsed result:', r); - return new ParseResult(new Metadata(metadata), r); + return new ParseResult([], new Metadata(metadata), [], []); }); } } diff --git a/core/test/PdfParser.test.ts b/core/test/PdfParser.test.ts index 6ed7ce0..67b619b 100644 --- a/core/test/PdfParser.test.ts +++ b/core/test/PdfParser.test.ts @@ -3,129 +3,164 @@ import * as pdfjs from 
'pdfjs-dist/es5/build/pdf'; import * as fs from 'fs'; import ParseProgressReporter from 'src/ParseProgressReporter'; import Progress from 'src/Progress'; +import Item from 'src/Item'; const parser = new PdfParser(pdfjs); test('basic example PDF parse', async () => { const progressUpdates: Progress[] = []; const data = fs.readFileSync('../examples/ExamplePdf.pdf', null); + + // to test const result = await parser.parseBytes( data, new ParseProgressReporter((progress) => progressUpdates.push(JSON.parse(JSON.stringify(progress)) as Progress)), ); + + // verify pages const expectedPages = 7; expect(result.metadata.title()).toEqual('ExamplePdf'); expect(result.metadata.author()).toEqual('Johannes Zillmann'); - expect(result.pages.length).toBe(expectedPages); - expect(result.pages[0].index).toBe(0); - expect(result.pages[0].viewPortTransform).toEqual([1, 0, 0, -1, 0, 841.8898]); - expect(result.pages[0].items).toEqual([ - { + expect(result.pageCount()).toBe(expectedPages); + result.pdfPages.forEach((pdfPage, i) => { + expect(pdfPage._pageIndex).toBe(i); + }); + expect(result.pdfPages[0]._pageInfo.view).toEqual([0, 0, 595.2756, 841.8898]); + expect(result.pdfPages[0].getViewport({ scale: 1.0 }).transform).toEqual([1, 0, 0, -1, 0, 841.8898]); + + // verify first n items + expect(result.items.slice(0, 16)).toEqual([ + new Item(0, { str: 'Mega Überschrift', dir: 'ltr', width: 245.05800000000005, height: 30, transform: [30, 0, 0, 30, 175, 756], fontName: 'g_d0_f1', - }, - { + }), + new Item(0, { str: '2te Überschrift', dir: 'ltr', width: 130.056, height: 20, transform: [20, 0, 0, 20, 233, 665], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: 'Dies ist eine Test-PDF', dir: 'ltr', width: 108.61950000000003, height: 11, transform: [11, 0, 0, 11, 240, 585], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: '.', dir: 'ltr', width: 3.0580000000000003, height: 11, transform: [11, 0, 0, 11, 352.6927, 585], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: '1', 
dir: 'ltr', width: 4.077333704, height: 7.333334, transform: [7.333334, 0, 0, 7.333334, 348, 588], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: 'Für’s Testen des ', dir: 'ltr', width: 83.7826, height: 11, transform: [11, 0, 0, 11, 208, 572], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: 'Markdown Parsers', dir: 'ltr', width: 91.6982, height: 11, transform: [11, 0, 0, 11, 291.77832, 572], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: '.', dir: 'ltr', width: 3.0580000000000003, height: 11, transform: [11, 0, 0, 11, 383.47360000000003, 572], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: ' ', dir: 'ltr', width: 3.0580000000000003, height: 11, transform: [11, 0, 0, 11, 61.078451, 59], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: 'In Deutsch.', dir: 'ltr', width: 55.64240000000001, height: 11, transform: [11, 0, 0, 11, 64.134603, 59], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: '1', dir: 'ltr', width: 4.077333704, height: 7.333334, transform: [7.333334, 0, 0, 7.333334, 57, 62], fontName: 'g_d0_f2', - }, - { + }), + new Item(0, { str: '\x00', dir: 'ltr', width: 0, height: 12, transform: [12, 0, 0, 12, 294, 45], fontName: 'g_d0_f3', - }, - { + }), + new Item(0, { str: '1', dir: 'ltr', width: 6.672000000000001, height: 12, transform: [12, 0, 0, 12, 294, 45], fontName: 'g_d0_f2', - }, + }), + new Item(1, { + str: '\x00', + dir: 'ltr', + width: 0, + height: 12, + transform: [12, 0, 0, 12, 294, 45], + fontName: 'g_d0_f3', + }), + new Item(1, { + str: '2', + dir: 'ltr', + width: 6.672000000000001, + height: 12, + transform: [12, 0, 0, 12, 294, 45], + fontName: 'g_d0_f2', + }), + new Item(2, { + str: 'Paragraphen', + dir: 'ltr', + width: 110.04479999999998, + height: 18, + transform: [18, 0, 0, 18, 57, 767], + fontName: 'g_d0_f1', + }), ]); + // verify progress expect(progressUpdates.length).toBe(expectedPages + 2); progressUpdates.forEach((update) => expect(update.stages).toEqual(['Document Header', 
'Metadata', 'Pages', 'Fonts'])); expect(progressUpdates[0].stageProgress).toEqual([1, 0, 0, 0]); diff --git a/ui/src/Result.svelte b/ui/src/Result.svelte index ea8c877..fddfae3 100644 --- a/ui/src/Result.svelte +++ b/ui/src/Result.svelte @@ -1,22 +1,15 @@ -
- Parsed - {parseResult.pages.length} - pages with - {parseResult.pages.reduce((count, page) => count + page.items.length, 0)} - items +
+
Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items
+
Title: {parseResult.metadata.title()}
+
Author: {parseResult.metadata.author()}
-
Title: {parseResult.metadata.title()}
-
Author: {parseResult.metadata.author()}
-{#each parseResult.pages as page} -
Page {page.index}
- -{/each} +
diff --git a/ui/src/Table.svelte b/ui/src/Table.svelte index 674122f..61af74e 100644 --- a/ui/src/Table.svelte +++ b/ui/src/Table.svelte @@ -1,52 +1,67 @@ -
-
- {#each headers as header} -
{header}
+
+ + + {#each columns as column} + {/each} - {#each items as item, i} -
-
{i + 1}
-
{item.str}
-
{item.fontName}
-
{item.dir}
-
{item.width}
-
{item.height}
-
{item.transform.join(', ')}
-
+ + + {#each items as item, idx} + {#if idx > 0 && item.page !== items[idx - 1].page} + + {/if} + + {#if idx === 0 || item.page !== items[idx - 1].page} + + {:else} + + {#each columns as column} + + {/each} + {/each} - - + +
+ #{column}
Page {item.page} + {/if} + {idx}{item.data[column]}