From a98a862a586916d8d75c3a4e6ba663a39204b330 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Tue, 9 Feb 2021 22:48:56 +0100 Subject: [PATCH] Transform messages --- core/src/Debugger.ts | 18 ++-- core/src/ItemResult.ts | 6 ++ core/src/PdfParser.ts | 100 +++++++++---------- core/src/transformer/AdjustHeight.ts | 7 +- core/src/transformer/CalculateCoordinates.ts | 8 +- core/src/transformer/ItemTransformer.ts | 3 +- ui/src/App.svelte | 2 +- ui/src/debug/DebugView.svelte | 18 ++-- ui/src/debug/ItemTable.svelte | 2 +- 9 files changed, 92 insertions(+), 72 deletions(-) create mode 100644 core/src/ItemResult.ts diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts index 294e673..da579b5 100644 --- a/core/src/Debugger.ts +++ b/core/src/Debugger.ts @@ -1,7 +1,7 @@ import { assert } from './assert'; import Item from './Item'; +import ItemResult from './ItemResult'; import ItemTransformer from './transformer/ItemTransformer'; -import ParseResult from './ParseResult'; import { calculateSchemas } from './transformer/transformerUtil'; import TransformContext from './transformer/TransformContext'; @@ -11,7 +11,7 @@ export default class Debugger { transformers: ItemTransformer[]; stageNames: string[]; stageSchema: string[][]; - private stageItems: Item[][]; + private stageResultCache: ItemResult[]; constructor( initialSchema: string[], @@ -23,19 +23,21 @@ export default class Debugger { this.transformers = transformers; this.context = context; this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; - this.stageItems = [initialItems]; + this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }]; this.stageSchema = calculateSchemas(initialSchema, transformers); } //TODO return MarkedItem ? (removed, added, etc..)? //TODO StageResult == class with schema and marked items ? - stageResults(stageIndex: number): Item[] { + stageResults(stageIndex: number): ItemResult { for (let idx = 0; idx < stageIndex + 1; idx++) { - if (!this.stageItems[idx]) { - const stageItems = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]); - this.stageItems.push(stageItems); + if (!this.stageResultCache[idx]) { + const stageResult = this.transformers[idx - 1].transform(this.context, [ + ...this.stageResultCache[idx - 1].items, + ]); + this.stageResultCache.push(stageResult); } } - return this.stageItems[stageIndex]; + return this.stageResultCache[stageIndex]; } } diff --git a/core/src/ItemResult.ts b/core/src/ItemResult.ts new file mode 100644 index 0000000..19d7c56 --- /dev/null +++ b/core/src/ItemResult.ts @@ -0,0 +1,6 @@ +import type Item from './Item'; + +export default interface ItemResult { + items: Item[]; + messages: string[]; +} diff --git a/core/src/PdfParser.ts b/core/src/PdfParser.ts index 09f5b56..66753b1 100644 --- a/core/src/PdfParser.ts +++ b/core/src/PdfParser.ts @@ -2,8 +2,6 @@ import Item from './Item'; import Metadata from './Metadata'; import type ParseReporter from './ParseReporter'; import ParseResult from './ParseResult'; -import TextDirection from './TextDirection'; -import type TextItem from './TextItem'; /** * Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS. @@ -86,59 +84,59 @@ export default class PdfParser { return page.getOperatorList(); } - async parseOld(data: Uint8Array): Promise { - return this.pdfjs - .getDocument({ - data, - cMapUrl: 'cmaps/', - cMapPacked: true, - }) - .promise.then((pdfDocument) => { - // console.log('result', pdfDocument); - const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { - return accumulatorPromise.then((accumulatedResults) => { - // console.log('Parsing page ' + index); - return pdfDocument.getPage(index + 1).then((page) => { - const viewport = page.getViewport({ scale: 1.0 }); - console.log(viewport); + // async parseOld(data: Uint8Array): Promise { + // return this.pdfjs + // .getDocument({ + // data, + // cMapUrl: 'cmaps/', + // cMapPacked: true, + // }) + // .promise.then((pdfDocument) => { + // // console.log('result', pdfDocument); + // const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { + // return accumulatorPromise.then((accumulatedResults) => { + // // console.log('Parsing page ' + index); + // return pdfDocument.getPage(index + 1).then((page) => { + // const viewport = page.getViewport({ scale: 1.0 }); + // console.log(viewport); - return this.triggerFontRetrieval(page).then(() => - page.getTextContent().then((textContent) => { - // console.log(textContent); - const textItems: TextItem[] = textContent.items.map((item) => { - const tx = this.pdfjs.Util.transform(viewport.transform, item.transform); - const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); - const dividedHeight = item.height / fontHeight; + // return this.triggerFontRetrieval(page).then(() => + // page.getTextContent().then((textContent) => { + // // console.log(textContent); + // const textItems: TextItem[] = textContent.items.map((item) => { + // const tx = this.pdfjs.Util.transform(viewport.transform, item.transform); + // const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); + // const dividedHeight = item.height / fontHeight; - return { - x: Math.round(item.transform[4]), - y: Math.round(item.transform[5]), - width: Math.round(item.width), - height: Math.round( - Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight, - ), - text: item.str, - textDirection: TextDirection.fromPdfJs(item.dir), - fontId: item.fontName, - }; - }); + // return { + // x: Math.round(item.transform[4]), + // y: Math.round(item.transform[5]), + // width: Math.round(item.width), + // height: Math.round( + // Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight, + // ), + // text: item.str, + // textDirection: TextDirection.fromPdfJs(item.dir), + // fontId: item.fontName, + // }; + // }); - return [...accumulatedResults, ...textItems]; - }), - ); - }); - }); - }, Promise.resolve([])); - return Promise.all([pdfDocument.getMetadata(), result]); - }) - .then(([metadata, r]) => { - // console.log('Parsed metadata:', metadata); - // console.log('Parsed result:', r.length); - // console.log('Parsed result:', r); + // return [...accumulatedResults, ...textItems]; + // }), + // ); + // }); + // }); + // }, Promise.resolve([])); + // return Promise.all([pdfDocument.getMetadata(), result]); + // }) + // .then(([metadata, r]) => { + // // console.log('Parsed metadata:', metadata); + // // console.log('Parsed result:', r.length); + // // console.log('Parsed result:', r); - return {}; - }); - } + // return {}; + // }); + // } } interface ParsedPage { diff --git a/core/src/transformer/AdjustHeight.ts b/core/src/transformer/AdjustHeight.ts index 3749dc2..e6dbfb8 100644 --- a/core/src/transformer/AdjustHeight.ts +++ b/core/src/transformer/AdjustHeight.ts @@ -1,5 +1,6 @@ import PageViewport from 'src/parse/PageViewport'; import Item from '../Item'; +import ItemResult from '../ItemResult'; import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; @@ -10,11 +11,12 @@ export default class AdjustHeight extends ItemTransformer { }); } - transform(context: TransformContext, items: Item[]): Item[] { + transform(context: TransformContext, items: Item[]): ItemResult { const newItems: Item[] = []; let page = -1; let pageViewport: PageViewport; //TODO groupBy page + let correctedHeights = 0; items.forEach((item) => { if (item.page !== page) { pageViewport = context.pageViewports[item.page]; @@ -27,11 +29,12 @@ export default class AdjustHeight extends ItemTransformer { const dividedHeight = itemHeight / fontHeight; const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight; if (newHeight !== itemHeight) { + correctedHeights++; newItems.push(item.withDataAddition({ height: newHeight })); } else { newItems.push(item); } }); - return items; + return { items, messages: [`${correctedHeights} corrected heights`] }; } } diff --git a/core/src/transformer/CalculateCoordinates.ts b/core/src/transformer/CalculateCoordinates.ts index 9e30a00..af83047 100644 --- a/core/src/transformer/CalculateCoordinates.ts +++ b/core/src/transformer/CalculateCoordinates.ts @@ -1,4 +1,5 @@ import Item from '../Item'; +import ItemResult from '../ItemResult'; import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; @@ -11,9 +12,12 @@ export default class CalculateCoordinates extends ItemTransformer { }); } - transform(context: TransformContext, items: Item[]): Item[] { + transform(context: TransformContext, items: Item[]): ItemResult { // const transform: number[] = item.value['Transform']; items.shift(); - return items; + if(items[0]){ + items[0].data['fontName']='xxx'; + } + return { items, messages: [] }; } } diff --git a/core/src/transformer/ItemTransformer.ts b/core/src/transformer/ItemTransformer.ts index d89284f..cae0cb4 100644 --- a/core/src/transformer/ItemTransformer.ts +++ b/core/src/transformer/ItemTransformer.ts @@ -1,6 +1,7 @@ import TransformerDescription from '../TransformerDescription'; import type Item from '../Item'; import TransformContext from './TransformContext'; +import ItemResult from 'src/ItemResult'; export default abstract class ItemTransformer { readonly name: string; @@ -21,5 +22,5 @@ export default abstract class ItemTransformer { } // columnar-changes: described - abstract transform(context: TransformContext, items: Item[]): Item[]; + abstract transform(context: TransformContext, items: Item[]): ItemResult; } diff --git a/ui/src/App.svelte b/ui/src/App.svelte index 626fa5a..4f89b5a 100644 --- a/ui/src/App.svelte +++ b/ui/src/App.svelte @@ -6,7 +6,7 @@
PDF to Markdown Converter
-
+
{#if $debug} {:else} diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte index b5fa76d..6d2cd00 100644 --- a/ui/src/debug/DebugView.svelte +++ b/ui/src/debug/DebugView.svelte @@ -14,12 +14,12 @@ $: canNext = currentStage + 1 < stageNames.length; $: canPrev = currentStage > 0; $: stageSchema = debug.stageSchema[currentStage]; - $: stageItems = debug.stageResults(currentStage); + $: stageResult = debug.stageResults(currentStage); $: pageFocus = !isNaN(focusedPage); - $: pagesNumbers = new Set(stageItems.map((item) => item.page)); + $: pagesNumbers = new Set(stageResult.items.map((item) => item.page)); $: maxPage = Math.max(...pagesNumbers); $: itemsByPage = [ - ...stageItems.reduce((map, item) => { + ...stageResult.items.reduce((map, item) => { if (!map.has(item.page)) { map.set(item.page, []); } @@ -41,11 +41,9 @@
-
-
@@ -89,10 +87,18 @@ canNext && currentStage++}> -
{stageNames[currentStage]}
+
{stageNames[currentStage]}
+ +
    + {#each stageResult.messages as message} +
  • {message}
  • + {/each} +
+ + diff --git a/ui/src/debug/ItemTable.svelte b/ui/src/debug/ItemTable.svelte index 73f5cf5..636d26a 100644 --- a/ui/src/debug/ItemTable.svelte +++ b/ui/src/debug/ItemTable.svelte @@ -51,7 +51,7 @@ {/if} {itemIdx} {#each schema as column} - {format(item.data[column])} + {format(item.data[column])} {/each} {/each}