Transform messages

This commit is contained in:
Johannes Zillmann 2021-02-09 22:48:56 +01:00
parent 4401f1fb5c
commit a98a862a58
9 changed files with 92 additions and 72 deletions

View File

@ -1,7 +1,7 @@
import { assert } from './assert';
import Item from './Item';
import ItemResult from './ItemResult';
import ItemTransformer from './transformer/ItemTransformer';
import ParseResult from './ParseResult';
import { calculateSchemas } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext';
@ -11,7 +11,7 @@ export default class Debugger {
transformers: ItemTransformer[];
stageNames: string[];
stageSchema: string[][];
private stageItems: Item[][];
private stageResultCache: ItemResult[];
constructor(
initialSchema: string[],
@ -23,19 +23,21 @@ export default class Debugger {
this.transformers = transformers;
this.context = context;
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
this.stageItems = [initialItems];
this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }];
this.stageSchema = calculateSchemas(initialSchema, transformers);
}
//TODO return MarkedItem ? (removed, added, etc..)?
//TODO StageResult == class with schema and marked items ?
stageResults(stageIndex: number): Item[] {
stageResults(stageIndex: number): ItemResult {
for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageItems[idx]) {
const stageItems = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]);
this.stageItems.push(stageItems);
if (!this.stageResultCache[idx]) {
const stageResult = this.transformers[idx - 1].transform(this.context, [
...this.stageResultCache[idx - 1].items,
]);
this.stageResultCache.push(stageResult);
}
}
return this.stageItems[stageIndex];
return this.stageResultCache[stageIndex];
}
}

6
core/src/ItemResult.ts Normal file
View File

@ -0,0 +1,6 @@
import type Item from './Item';
/**
 * Outcome of a single transformation stage: the stage's items bundled with
 * human-readable messages about what the stage did.
 *
 * This is the return type of `ItemTransformer.transform`, and `Debugger`
 * keeps one `ItemResult` per stage in its result cache.
 */
export default interface ItemResult {
/** Items returned by the stage. */
items: Item[];
/** Free-form notes describing the stage (e.g. a count of corrected heights); may be empty. */
messages: string[];
}

View File

@ -2,8 +2,6 @@ import Item from './Item';
import Metadata from './Metadata';
import type ParseReporter from './ParseReporter';
import ParseResult from './ParseResult';
import TextDirection from './TextDirection';
import type TextItem from './TextItem';
/**
* Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
@ -86,59 +84,59 @@ export default class PdfParser {
return page.getOperatorList();
}
async parseOld(data: Uint8Array): Promise<ParseResult> {
return this.pdfjs
.getDocument({
data,
cMapUrl: 'cmaps/',
cMapPacked: true,
})
.promise.then((pdfDocument) => {
// console.log('result', pdfDocument);
const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => {
// console.log('Parsing page ' + index);
return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 });
console.log(viewport);
// async parseOld(data: Uint8Array): Promise<ParseResult> {
// return this.pdfjs
// .getDocument({
// data,
// cMapUrl: 'cmaps/',
// cMapPacked: true,
// })
// .promise.then((pdfDocument) => {
// // console.log('result', pdfDocument);
// const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
// return accumulatorPromise.then((accumulatedResults) => {
// // console.log('Parsing page ' + index);
// return pdfDocument.getPage(index + 1).then((page) => {
// const viewport = page.getViewport({ scale: 1.0 });
// console.log(viewport);
return this.triggerFontRetrieval(page).then(() =>
page.getTextContent().then((textContent) => {
// console.log(textContent);
const textItems: TextItem[] = textContent.items.map((item) => {
const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
const dividedHeight = item.height / fontHeight;
// return this.triggerFontRetrieval(page).then(() =>
// page.getTextContent().then((textContent) => {
// // console.log(textContent);
// const textItems: TextItem[] = textContent.items.map((item) => {
// const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
// const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
// const dividedHeight = item.height / fontHeight;
return {
x: Math.round(item.transform[4]),
y: Math.round(item.transform[5]),
width: Math.round(item.width),
height: Math.round(
Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
),
text: item.str,
textDirection: TextDirection.fromPdfJs(item.dir),
fontId: item.fontName,
};
});
// return {
// x: Math.round(item.transform[4]),
// y: Math.round(item.transform[5]),
// width: Math.round(item.width),
// height: Math.round(
// Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
// ),
// text: item.str,
// textDirection: TextDirection.fromPdfJs(item.dir),
// fontId: item.fontName,
// };
// });
return [...accumulatedResults, ...textItems];
}),
);
});
});
}, Promise.resolve([]));
return Promise.all([pdfDocument.getMetadata(), result]);
})
.then(([metadata, r]) => {
// console.log('Parsed metadata:', metadata);
// console.log('Parsed result:', r.length);
// console.log('Parsed result:', r);
// return [...accumulatedResults, ...textItems];
// }),
// );
// });
// });
// }, Promise.resolve([]));
// return Promise.all([pdfDocument.getMetadata(), result]);
// })
// .then(([metadata, r]) => {
// // console.log('Parsed metadata:', metadata);
// // console.log('Parsed result:', r.length);
// // console.log('Parsed result:', r);
return {};
});
}
// return {};
// });
// }
}
interface ParsedPage {

View File

@ -1,5 +1,6 @@
import PageViewport from 'src/parse/PageViewport';
import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
@ -10,11 +11,12 @@ export default class AdjustHeight extends ItemTransformer {
});
}
transform(context: TransformContext, items: Item[]): Item[] {
transform(context: TransformContext, items: Item[]): ItemResult {
const newItems: Item[] = [];
let page = -1;
let pageViewport: PageViewport;
//TODO groupBy page
let correctedHeights = 0;
items.forEach((item) => {
if (item.page !== page) {
pageViewport = context.pageViewports[item.page];
@ -27,11 +29,12 @@ export default class AdjustHeight extends ItemTransformer {
const dividedHeight = itemHeight / fontHeight;
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
if (newHeight !== itemHeight) {
correctedHeights++;
newItems.push(item.withDataAddition({ height: newHeight }));
} else {
newItems.push(item);
}
});
return items;
return { items, messages: [`${correctedHeights} corrected heights`] };
}
}

View File

@ -1,4 +1,5 @@
import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
@ -11,9 +12,12 @@ export default class CalculateCoordinates extends ItemTransformer {
});
}
transform(context: TransformContext, items: Item[]): Item[] {
transform(context: TransformContext, items: Item[]): ItemResult {
// const transform: number[] = item.value['Transform'];
items.shift();
return items;
if(items[0]){
items[0].data['fontName']='xxx';
}
return { items, messages: [] };
}
}

View File

@ -1,6 +1,7 @@
import TransformerDescription from '../TransformerDescription';
import type Item from '../Item';
import TransformContext from './TransformContext';
import ItemResult from 'src/ItemResult';
export default abstract class ItemTransformer {
readonly name: string;
@ -21,5 +22,5 @@ export default abstract class ItemTransformer {
}
// columnar-changes: described
abstract transform(context: TransformContext, items: Item[]): Item[];
abstract transform(context: TransformContext, items: Item[]): ItemResult;
}

View File

@ -6,7 +6,7 @@
</script>
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
<main class="mt-5 h-full">
<main class="mt-2 h-full">
{#if $debug}
<DebugView debug={$debug} />
{:else}

View File

@ -14,12 +14,12 @@
$: canNext = currentStage + 1 < stageNames.length;
$: canPrev = currentStage > 0;
$: stageSchema = debug.stageSchema[currentStage];
$: stageItems = debug.stageResults(currentStage);
$: stageResult = debug.stageResults(currentStage);
$: pageFocus = !isNaN(focusedPage);
$: pagesNumbers = new Set(stageItems.map((item) => item.page));
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
$: maxPage = Math.max(...pagesNumbers);
$: itemsByPage = [
...stageItems.reduce((map, item) => {
...stageResult.items.reduce((map, item) => {
if (!map.has(item.page)) {
map.set(item.page, []);
}
@ -41,11 +41,9 @@
</script>
<div class="mx-4">
<div class="mb-4">
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div> -->
</div>
<!-- Sticky Controls -->
<div class="controls py-2">
@ -89,10 +87,18 @@
<span on:click={() => canNext && currentStage++}>
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
</span>
<div>{stageNames[currentStage]}</div>
<div class="cursor-pointer hover:underline">{stageNames[currentStage]}</div>
</div>
</div>
<!-- Stage Messages -->
<ul class="list-disc list-inside mb-2 p-2 bg-yellow-100 rounded shadow text-sm">
{#each stageResult.messages as message}
<li>{message}</li>
{/each}
</ul>
<!-- Items -->
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
</div>

View File

@ -51,7 +51,7 @@
{/if}
<td>{itemIdx}</td>
{#each schema as column}
<td>{format(item.data[column])}</td>
<td class="select-all">{format(item.data[column])}</td>
{/each}
</tr>
{/each}