Transform messages

This commit is contained in:
Johannes Zillmann 2021-02-09 22:48:56 +01:00
parent 4401f1fb5c
commit a98a862a58
9 changed files with 92 additions and 72 deletions

View File

@ -1,7 +1,7 @@
import { assert } from './assert'; import { assert } from './assert';
import Item from './Item'; import Item from './Item';
import ItemResult from './ItemResult';
import ItemTransformer from './transformer/ItemTransformer'; import ItemTransformer from './transformer/ItemTransformer';
import ParseResult from './ParseResult';
import { calculateSchemas } from './transformer/transformerUtil'; import { calculateSchemas } from './transformer/transformerUtil';
import TransformContext from './transformer/TransformContext'; import TransformContext from './transformer/TransformContext';
@ -11,7 +11,7 @@ export default class Debugger {
transformers: ItemTransformer[]; transformers: ItemTransformer[];
stageNames: string[]; stageNames: string[];
stageSchema: string[][]; stageSchema: string[][];
private stageItems: Item[][]; private stageResultCache: ItemResult[];
constructor( constructor(
initialSchema: string[], initialSchema: string[],
@ -23,19 +23,21 @@ export default class Debugger {
this.transformers = transformers; this.transformers = transformers;
this.context = context; this.context = context;
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)]; this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
this.stageItems = [initialItems]; this.stageResultCache = [{ items: initialItems, messages: [`Parsed ${initialItems[initialItems.length-1].page+1} pages with ${initialItems.length} items`] }];
this.stageSchema = calculateSchemas(initialSchema, transformers); this.stageSchema = calculateSchemas(initialSchema, transformers);
} }
//TODO return MarkedItem ? (removed, added, etc..)? //TODO return MarkedItem ? (removed, added, etc..)?
//TODO StageResult == class with schema and marked items ? //TODO StageResult == class with schema and marked items ?
stageResults(stageIndex: number): Item[] { stageResults(stageIndex: number): ItemResult {
for (let idx = 0; idx < stageIndex + 1; idx++) { for (let idx = 0; idx < stageIndex + 1; idx++) {
if (!this.stageItems[idx]) { if (!this.stageResultCache[idx]) {
const stageItems = this.transformers[idx - 1].transform(this.context, this.stageItems[idx - 1]); const stageResult = this.transformers[idx - 1].transform(this.context, [
this.stageItems.push(stageItems); ...this.stageResultCache[idx - 1].items,
]);
this.stageResultCache.push(stageResult);
} }
} }
return this.stageItems[stageIndex]; return this.stageResultCache[stageIndex];
} }
} }

6
core/src/ItemResult.ts Normal file
View File

@ -0,0 +1,6 @@
import type Item from './Item';
/**
 * Pairs a list of items with human-readable messages about that list.
 */
export default interface ItemResult {
/** The items carried by this result. */
items: Item[];
/** Free-form text messages accompanying the items (e.g. a short summary such as "3 corrected heights"). */
messages: string[];
}

View File

@ -2,8 +2,6 @@ import Item from './Item';
import Metadata from './Metadata'; import Metadata from './Metadata';
import type ParseReporter from './ParseReporter'; import type ParseReporter from './ParseReporter';
import ParseResult from './ParseResult'; import ParseResult from './ParseResult';
import TextDirection from './TextDirection';
import type TextItem from './TextItem';
/** /**
* Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS. * Parses a PDF via PDFJS and returns a ParseResult which contains more or less the original data from PDFJS.
@ -86,59 +84,59 @@ export default class PdfParser {
return page.getOperatorList(); return page.getOperatorList();
} }
async parseOld(data: Uint8Array): Promise<ParseResult> { // async parseOld(data: Uint8Array): Promise<ParseResult> {
return this.pdfjs // return this.pdfjs
.getDocument({ // .getDocument({
data, // data,
cMapUrl: 'cmaps/', // cMapUrl: 'cmaps/',
cMapPacked: true, // cMapPacked: true,
}) // })
.promise.then((pdfDocument) => { // .promise.then((pdfDocument) => {
// console.log('result', pdfDocument); // // console.log('result', pdfDocument);
const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => { // const result = [...Array(pdfDocument.numPages)].reduce((accumulatorPromise, _, index) => {
return accumulatorPromise.then((accumulatedResults) => { // return accumulatorPromise.then((accumulatedResults) => {
// console.log('Parsing page ' + index); // // console.log('Parsing page ' + index);
return pdfDocument.getPage(index + 1).then((page) => { // return pdfDocument.getPage(index + 1).then((page) => {
const viewport = page.getViewport({ scale: 1.0 }); // const viewport = page.getViewport({ scale: 1.0 });
console.log(viewport); // console.log(viewport);
return this.triggerFontRetrieval(page).then(() => // return this.triggerFontRetrieval(page).then(() =>
page.getTextContent().then((textContent) => { // page.getTextContent().then((textContent) => {
// console.log(textContent); // // console.log(textContent);
const textItems: TextItem[] = textContent.items.map((item) => { // const textItems: TextItem[] = textContent.items.map((item) => {
const tx = this.pdfjs.Util.transform(viewport.transform, item.transform); // const tx = this.pdfjs.Util.transform(viewport.transform, item.transform);
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); // const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
const dividedHeight = item.height / fontHeight; // const dividedHeight = item.height / fontHeight;
return { // return {
x: Math.round(item.transform[4]), // x: Math.round(item.transform[4]),
y: Math.round(item.transform[5]), // y: Math.round(item.transform[5]),
width: Math.round(item.width), // width: Math.round(item.width),
height: Math.round( // height: Math.round(
Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight, // Number.isNaN(dividedHeight) || dividedHeight <= 1 ? item.height : dividedHeight,
), // ),
text: item.str, // text: item.str,
textDirection: TextDirection.fromPdfJs(item.dir), // textDirection: TextDirection.fromPdfJs(item.dir),
fontId: item.fontName, // fontId: item.fontName,
}; // };
}); // });
return [...accumulatedResults, ...textItems]; // return [...accumulatedResults, ...textItems];
}), // }),
); // );
}); // });
}); // });
}, Promise.resolve([])); // }, Promise.resolve([]));
return Promise.all([pdfDocument.getMetadata(), result]); // return Promise.all([pdfDocument.getMetadata(), result]);
}) // })
.then(([metadata, r]) => { // .then(([metadata, r]) => {
// console.log('Parsed metadata:', metadata); // // console.log('Parsed metadata:', metadata);
// console.log('Parsed result:', r.length); // // console.log('Parsed result:', r.length);
// console.log('Parsed result:', r); // // console.log('Parsed result:', r);
return {}; // return {};
}); // });
} // }
} }
interface ParsedPage { interface ParsedPage {

View File

@ -1,5 +1,6 @@
import PageViewport from 'src/parse/PageViewport'; import PageViewport from 'src/parse/PageViewport';
import Item from '../Item'; import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer'; import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext'; import TransformContext from './TransformContext';
@ -10,11 +11,12 @@ export default class AdjustHeight extends ItemTransformer {
}); });
} }
transform(context: TransformContext, items: Item[]): Item[] { transform(context: TransformContext, items: Item[]): ItemResult {
const newItems: Item[] = []; const newItems: Item[] = [];
let page = -1; let page = -1;
let pageViewport: PageViewport; let pageViewport: PageViewport;
//TODO groupBy page //TODO groupBy page
let correctedHeights = 0;
items.forEach((item) => { items.forEach((item) => {
if (item.page !== page) { if (item.page !== page) {
pageViewport = context.pageViewports[item.page]; pageViewport = context.pageViewports[item.page];
@ -27,11 +29,12 @@ export default class AdjustHeight extends ItemTransformer {
const dividedHeight = itemHeight / fontHeight; const dividedHeight = itemHeight / fontHeight;
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight; const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
if (newHeight !== itemHeight) { if (newHeight !== itemHeight) {
correctedHeights++;
newItems.push(item.withDataAddition({ height: newHeight })); newItems.push(item.withDataAddition({ height: newHeight }));
} else { } else {
newItems.push(item); newItems.push(item);
} }
}); });
return items; return { items, messages: [`${correctedHeights} corrected heights`] };
} }
} }

View File

@ -1,4 +1,5 @@
import Item from '../Item'; import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer'; import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext'; import TransformContext from './TransformContext';
@ -11,9 +12,12 @@ export default class CalculateCoordinates extends ItemTransformer {
}); });
} }
transform(context: TransformContext, items: Item[]): Item[] { transform(context: TransformContext, items: Item[]): ItemResult {
// const transform: number[] = item.value['Transform']; // const transform: number[] = item.value['Transform'];
items.shift(); items.shift();
return items; if(items[0]){
items[0].data['fontName']='xxx';
}
return { items, messages: [] };
} }
} }

View File

@ -1,6 +1,7 @@
import TransformerDescription from '../TransformerDescription'; import TransformerDescription from '../TransformerDescription';
import type Item from '../Item'; import type Item from '../Item';
import TransformContext from './TransformContext'; import TransformContext from './TransformContext';
import ItemResult from 'src/ItemResult';
export default abstract class ItemTransformer { export default abstract class ItemTransformer {
readonly name: string; readonly name: string;
@ -21,5 +22,5 @@ export default abstract class ItemTransformer {
} }
// columnar-changes: described // columnar-changes: described
abstract transform(context: TransformContext, items: Item[]): Item[]; abstract transform(context: TransformContext, items: Item[]): ItemResult;
} }

View File

@ -6,7 +6,7 @@
</script> </script>
<div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div> <div class="text-2xl font-semibold font-serif text-center bg-gray-400">PDF to Markdown Converter</div>
<main class="mt-5 h-full"> <main class="mt-2 h-full">
{#if $debug} {#if $debug}
<DebugView debug={$debug} /> <DebugView debug={$debug} />
{:else} {:else}

View File

@ -14,12 +14,12 @@
$: canNext = currentStage + 1 < stageNames.length; $: canNext = currentStage + 1 < stageNames.length;
$: canPrev = currentStage > 0; $: canPrev = currentStage > 0;
$: stageSchema = debug.stageSchema[currentStage]; $: stageSchema = debug.stageSchema[currentStage];
$: stageItems = debug.stageResults(currentStage); $: stageResult = debug.stageResults(currentStage);
$: pageFocus = !isNaN(focusedPage); $: pageFocus = !isNaN(focusedPage);
$: pagesNumbers = new Set(stageItems.map((item) => item.page)); $: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
$: maxPage = Math.max(...pagesNumbers); $: maxPage = Math.max(...pagesNumbers);
$: itemsByPage = [ $: itemsByPage = [
...stageItems.reduce((map, item) => { ...stageResult.items.reduce((map, item) => {
if (!map.has(item.page)) { if (!map.has(item.page)) {
map.set(item.page, []); map.set(item.page, []);
} }
@ -41,11 +41,9 @@
</script> </script>
<div class="mx-4"> <div class="mx-4">
<div class="mb-4">
<!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div> <!-- <div>Parsed {parseResult.pageCount()} pages with {parseResult.items.length} items</div>
<div>Title: {parseResult.metadata.title()}</div> <div>Title: {parseResult.metadata.title()}</div>
<div>Author: {parseResult.metadata.author()}</div> --> <div>Author: {parseResult.metadata.author()}</div> -->
</div>
<!-- Sticky Controls --> <!-- Sticky Controls -->
<div class="controls py-2"> <div class="controls py-2">
@ -89,10 +87,18 @@
<span on:click={() => canNext && currentStage++}> <span on:click={() => canNext && currentStage++}>
<ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} /> <ArrowRight size="1x" class={canNext ? 'hover:text-green-700 cursor-pointer' : 'opacity-50'} />
</span> </span>
<div>{stageNames[currentStage]}</div> <div class="cursor-pointer hover:underline">{stageNames[currentStage]}</div>
</div> </div>
</div> </div>
<!-- Stage Messages -->
<ul class="list-disc list-inside mb-2 p-2 bg-yellow-100 rounded shadow text-sm">
{#each stageResult.messages as message}
<li>{message}</li>
{/each}
</ul>
<!-- Items -->
<ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} /> <ItemTable schema={stageSchema} itemsByPage={visiblePages} {maxPage} {pageFocus} />
</div> </div>

View File

@ -51,7 +51,7 @@
{/if} {/if}
<td>{itemIdx}</td> <td>{itemIdx}</td>
{#each schema as column} {#each schema as column}
<td>{format(item.data[column])}</td> <td class="select-all">{format(item.data[column])}</td>
{/each} {/each}
</tr> </tr>
{/each} {/each}