mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-01 03:09:01 +01:00
Track evaluations
This commit is contained in:
parent
c98145a63c
commit
0be95e4bbc
@ -6,28 +6,25 @@ import ColumnAnnotation from './debug/ColumnAnnotation';
|
||||
import AnnotatedColumn from './debug/AnnotatedColumn';
|
||||
import { detectChanges } from './debug/detectChanges';
|
||||
import { asPages } from './debug/Page';
|
||||
import EvaluationTracker from './transformer/EvaluationTracker';
|
||||
import ChangeTracker from './debug/ChangeTracker';
|
||||
import PageViewport from './parse/PageViewport';
|
||||
|
||||
export default class Debugger {
|
||||
private context: TransformContext;
|
||||
private transformers: ItemTransformer[];
|
||||
private stageResultCache: StageResult[];
|
||||
pageCount: number;
|
||||
fontMap: Map<string, object>;
|
||||
stageNames: string[];
|
||||
stageDescriptions: string[];
|
||||
|
||||
constructor(
|
||||
pageCount: number,
|
||||
public fontMap: Map<string, object>,
|
||||
private pageViewports: PageViewport[],
|
||||
public pageCount: number,
|
||||
inputSchema: string[],
|
||||
inputItems: Item[],
|
||||
context: TransformContext,
|
||||
transformers: ItemTransformer[],
|
||||
) {
|
||||
this.transformers = transformers;
|
||||
this.context = context;
|
||||
this.fontMap = context.fontMap;
|
||||
this.pageCount = pageCount;
|
||||
this.stageNames = ['Parse Result', ...transformers.map((t) => t.name)];
|
||||
this.stageDescriptions = ['Initial items as parsed by PDFjs', ...transformers.map((t) => t.description)];
|
||||
this.stageResultCache = [initialStage(inputSchema, inputItems)];
|
||||
@ -36,19 +33,21 @@ export default class Debugger {
|
||||
stageResult(stageIndex: number): StageResult {
|
||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||
if (!this.stageResultCache[idx]) {
|
||||
const evaluations = new EvaluationTracker();
|
||||
const context = new TransformContext(this.fontMap, this.pageViewports, evaluations);
|
||||
const transformer = this.transformers[idx - 1];
|
||||
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
||||
const previousItems = previousStageResult.itemsCleanedAndUnpacked();
|
||||
const inputSchema = toSimpleSchema(previousStageResult);
|
||||
const outputSchema = transformer.schemaTransformer(inputSchema);
|
||||
const itemResult = transformer.transform(this.context, [...previousItems]);
|
||||
const itemResult = transformer.transform(context, [...previousItems]);
|
||||
|
||||
const changeTracker = new ChangeTracker();
|
||||
const items = detectChanges(changeTracker, previousItems, itemResult.items);
|
||||
const pages = asPages(changeTracker, items, transformer.descriptor.debug?.itemMerger);
|
||||
const changes = new ChangeTracker();
|
||||
const items = detectChanges(changes, previousItems, itemResult.items);
|
||||
const pages = asPages(evaluations, changes, items, transformer.descriptor.debug?.itemMerger);
|
||||
const messages = itemResult.messages;
|
||||
if (changeTracker.changeCount() > 0 && messages.length === 0) {
|
||||
messages.unshift(`Detected ${changeTracker.changeCount()} changes`);
|
||||
if (changes.changeCount() > 0 && messages.length === 0) {
|
||||
messages.unshift(`Detected ${changes.changeCount()} changes`);
|
||||
}
|
||||
|
||||
this.stageResultCache.push(
|
||||
@ -56,7 +55,8 @@ export default class Debugger {
|
||||
transformer.descriptor,
|
||||
toAnnotatedSchema(inputSchema, outputSchema),
|
||||
pages,
|
||||
changeTracker,
|
||||
evaluations,
|
||||
changes,
|
||||
messages,
|
||||
),
|
||||
);
|
||||
|
@ -5,6 +5,7 @@ import ItemTransformer from './transformer/ItemTransformer';
|
||||
import ParseResult from './ParseResult';
|
||||
import Debugger from './Debugger';
|
||||
import { assert } from './assert';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
|
||||
export default class PdfPipeline {
|
||||
parser: PdfParser;
|
||||
@ -27,9 +28,9 @@ export default class PdfPipeline {
|
||||
async execute(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<ParseResult> {
|
||||
const parseResult = await this.parse(src, progressListener);
|
||||
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
||||
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
|
||||
let items = parseResult.items;
|
||||
this.transformers.forEach((transformer) => {
|
||||
const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports);
|
||||
items = transformer.transform(context, items).items;
|
||||
});
|
||||
parseResult.items = items;
|
||||
@ -38,8 +39,14 @@ export default class PdfPipeline {
|
||||
|
||||
async debug(src: string | Uint8Array | object, progressListener: ProgressListenFunction): Promise<Debugger> {
|
||||
const parseResult = await this.parse(src, progressListener);
|
||||
const context = { fontMap: parseResult.fontMap, pageViewports: parseResult.pageViewports };
|
||||
return new Debugger(parseResult.pageCount, parseResult.schema, parseResult.items, context, this.transformers);
|
||||
return new Debugger(
|
||||
parseResult.fontMap,
|
||||
parseResult.pageViewports,
|
||||
parseResult.pageCount,
|
||||
parseResult.schema,
|
||||
parseResult.items,
|
||||
this.transformers,
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -1,3 +1,4 @@
|
||||
import type EvaluationTracker from '../transformer/EvaluationTracker';
|
||||
import type ChangeTracker from './ChangeTracker';
|
||||
import type Item from '../Item';
|
||||
|
||||
@ -6,5 +7,5 @@ import type Item from '../Item';
|
||||
*/
|
||||
export default abstract class ItemMerger {
|
||||
constructor(public groupKey: string) {}
|
||||
abstract merge(tracker: ChangeTracker, items: Item[]): Item;
|
||||
abstract merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item;
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
import ItemMerger from './ItemMerger';
|
||||
import Item from '../Item';
|
||||
import EvaluationTracker from '../transformer/EvaluationTracker';
|
||||
import ChangeTracker from './ChangeTracker';
|
||||
|
||||
export default class LineItemMerger extends ItemMerger {
|
||||
@ -7,7 +8,7 @@ export default class LineItemMerger extends ItemMerger {
|
||||
super('line');
|
||||
}
|
||||
|
||||
merge(tracker: ChangeTracker, items: Item[]): Item {
|
||||
merge(evaluationTracker: EvaluationTracker, changeTracker: ChangeTracker, items: Item[]): Item {
|
||||
const page = items[0].page;
|
||||
const line = items[0].data['line'];
|
||||
const str = items.map((item) => item.data['str']).join(' ');
|
||||
@ -28,12 +29,14 @@ export default class LineItemMerger extends ItemMerger {
|
||||
dir: directions,
|
||||
});
|
||||
|
||||
if (items.find((item) => evaluationTracker.evaluated(item))) evaluationTracker.trackEvaluation(newItem);
|
||||
|
||||
if (this.trackAsNew) {
|
||||
tracker.trackAddition(newItem);
|
||||
} else if (items.every((item) => tracker.isRemoved(item))) {
|
||||
tracker.trackRemoval(newItem);
|
||||
} else if (items.find((item) => tracker.hasChanged(item))) {
|
||||
tracker.trackContentChange(newItem);
|
||||
changeTracker.trackAddition(newItem);
|
||||
} else if (items.every((item) => changeTracker.isRemoved(item))) {
|
||||
changeTracker.trackRemoval(newItem);
|
||||
} else if (items.find((item) => changeTracker.hasChanged(item))) {
|
||||
changeTracker.trackContentChange(newItem);
|
||||
}
|
||||
return newItem;
|
||||
}
|
||||
|
@ -1,5 +1,6 @@
|
||||
import Item from '../Item';
|
||||
import { groupByElement, groupByPage } from '../support/groupingUtils';
|
||||
import EvaluationTracker from '../transformer/EvaluationTracker';
|
||||
import ChangeTracker from './ChangeTracker';
|
||||
import ItemGroup from './ItemGroup';
|
||||
import ItemMerger from './ItemMerger';
|
||||
@ -9,13 +10,18 @@ export default interface Page {
|
||||
itemGroups: ItemGroup[];
|
||||
}
|
||||
|
||||
export function asPages(tracker: ChangeTracker, items: Item[], itemMerger?: ItemMerger): Page[] {
|
||||
export function asPages(
|
||||
evaluationTracker: EvaluationTracker,
|
||||
changeTracker: ChangeTracker,
|
||||
items: Item[],
|
||||
itemMerger?: ItemMerger,
|
||||
): Page[] {
|
||||
return groupByPage(items).map((pageItems: Item[]) => {
|
||||
let itemGroups: ItemGroup[];
|
||||
if (itemMerger) {
|
||||
itemGroups = groupByElement(pageItems, itemMerger.groupKey).map((groupItems) => {
|
||||
if (groupItems.length > 1) {
|
||||
const top = itemMerger.merge(tracker, groupItems);
|
||||
const top = itemMerger.merge(evaluationTracker, changeTracker, groupItems);
|
||||
return new ItemGroup(top, groupItems);
|
||||
} else {
|
||||
return new ItemGroup(groupItems[0]);
|
||||
|
@ -5,12 +5,15 @@ import Page, { asPages } from './Page';
|
||||
import ChangeIndex from './ChangeIndex';
|
||||
import ChangeTracker from './ChangeTracker';
|
||||
import ItemGroup from './ItemGroup';
|
||||
import EvaluationIndex from '../transformer/EvaluationIndex';
|
||||
import EvaluationTracker from '../transformer/EvaluationTracker';
|
||||
|
||||
export default class StageResult {
|
||||
constructor(
|
||||
public descriptor: TransformDescriptor,
|
||||
public schema: AnnotatedColumn[],
|
||||
public pages: Page[],
|
||||
public evaluations: EvaluationIndex,
|
||||
public changes: ChangeIndex,
|
||||
public messages: string[],
|
||||
) {}
|
||||
@ -50,7 +53,9 @@ export default class StageResult {
|
||||
(page) =>
|
||||
({
|
||||
...page,
|
||||
itemGroups: page.itemGroups.filter((itemGroup) => this.changes.hasChanged(itemGroup.top)),
|
||||
itemGroups: page.itemGroups.filter(
|
||||
(itemGroup) => this.evaluations.evaluated(itemGroup.top) || this.changes.hasChanged(itemGroup.top),
|
||||
),
|
||||
} as Page),
|
||||
);
|
||||
}
|
||||
@ -73,12 +78,13 @@ export default class StageResult {
|
||||
|
||||
export function initialStage(inputSchema: string[], inputItems: Item[]): StageResult {
|
||||
const schema = inputSchema.map((column) => ({ name: column }));
|
||||
const tracker = new ChangeTracker();
|
||||
const pages = asPages(tracker, inputItems);
|
||||
const evaluations = new EvaluationTracker();
|
||||
const changes = new ChangeTracker();
|
||||
const pages = asPages(evaluations, changes, inputItems);
|
||||
const messages = [
|
||||
`Parsed ${inputItems.length === 0 ? 0 : inputItems[inputItems.length - 1].page + 1} pages with ${
|
||||
inputItems.length
|
||||
} items`,
|
||||
];
|
||||
return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, tracker, messages);
|
||||
return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, evaluations, changes, messages);
|
||||
}
|
||||
|
@ -8,6 +8,10 @@ export function onlyUniques<T>(value: T, index: number, self: T[]) {
|
||||
return self.indexOf(value) === index;
|
||||
}
|
||||
|
||||
export function ascending<T>(a: number, b: number): number {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
export function count<T, S>(array: T[], find: (entry: T) => boolean): number {
|
||||
return array.reduce((count, entry) => (find(entry) ? count + 1 : count), 0);
|
||||
}
|
||||
|
14
core/src/transformer/EvaluationIndex.ts
Normal file
14
core/src/transformer/EvaluationIndex.ts
Normal file
@ -0,0 +1,14 @@
|
||||
import Item from '../Item';
|
||||
|
||||
export default interface EvaluationIndex {
|
||||
/**
|
||||
* Return the number of tracked evaluations.
|
||||
*/
|
||||
evaluationCount(): number;
|
||||
|
||||
/**
|
||||
* Returns true if the given item has been evaluated
|
||||
* @param item
|
||||
*/
|
||||
evaluated(item: Item): boolean;
|
||||
}
|
23
core/src/transformer/EvaluationTracker.ts
Normal file
23
core/src/transformer/EvaluationTracker.ts
Normal file
@ -0,0 +1,23 @@
|
||||
import { assertDefined } from '../assert';
|
||||
import Item from '../Item';
|
||||
import EvaluationIndex from './EvaluationIndex';
|
||||
|
||||
export default class EvaluationTracker implements EvaluationIndex {
|
||||
private evaluations: Set<string> = new Set();
|
||||
|
||||
evaluationCount() {
|
||||
return this.evaluations.size;
|
||||
}
|
||||
|
||||
evaluated(item: Item) {
|
||||
return this.evaluations.has(_uuid(item));
|
||||
}
|
||||
|
||||
trackEvaluation(item: Item) {
|
||||
this.evaluations.add(_uuid(item));
|
||||
}
|
||||
}
|
||||
|
||||
function _uuid(item: Item): string {
|
||||
return assertDefined(item.uuid, 'UUID is not set');
|
||||
}
|
@ -6,6 +6,7 @@ import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import LineItemMerger from '../debug/LineItemMerger';
|
||||
import {
|
||||
ascending,
|
||||
flatMap,
|
||||
groupByLine,
|
||||
groupByPage,
|
||||
@ -43,14 +44,14 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
||||
transform(context: TransformContext, inputItems: Item[]): ItemResult {
|
||||
const pageExtracts = buildExtracts(inputItems);
|
||||
|
||||
const uniqueYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
|
||||
const fringeYs = flatMap(pageExtracts, (extract) => extract.fringeLines)
|
||||
.map((line) => line.y)
|
||||
.filter(onlyUniques)
|
||||
.sort((a, b) => a - b);
|
||||
.sort(ascending);
|
||||
|
||||
// console.log('fringeYs', fringeYs);
|
||||
|
||||
const yToRemove = uniqueYs.filter((y) => {
|
||||
const yToRemove = fringeYs.filter((y) => {
|
||||
const yLines = pageExtracts
|
||||
.map((page) => page.lineByY(y))
|
||||
.filter((line) => typeof line !== 'undefined') as Line[];
|
||||
@ -73,11 +74,20 @@ export default class RemoveRepetitiveItems extends ItemTransformer {
|
||||
|
||||
//console.log('yToRemove', yToRemove);
|
||||
|
||||
let removalCount = 0;
|
||||
return {
|
||||
items: transformGroupedByPageAndLine(inputItems, (_, __, items) =>
|
||||
yToRemove.includes(yFromLine(items)) ? [] : items,
|
||||
),
|
||||
messages: [`Filtered out each item with y == ${yToRemove.join('||')}`],
|
||||
items: transformGroupedByPageAndLine(inputItems, (_, __, lineItems) => {
|
||||
const itemsY = yFromLine(lineItems);
|
||||
if (fringeYs.includes(itemsY)) {
|
||||
lineItems.forEach(context.trackEvaluation.bind(context));
|
||||
}
|
||||
if (yToRemove.includes(itemsY)) {
|
||||
removalCount++;
|
||||
return [];
|
||||
}
|
||||
return lineItems;
|
||||
}),
|
||||
messages: [`Filtered out ${removalCount} items with y == ${yToRemove.join('||')}`],
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -166,14 +176,14 @@ class Line {
|
||||
|
||||
constructor(public y: number, public items: Item[]) {}
|
||||
|
||||
text() {
|
||||
text(): string {
|
||||
if (!this._text) {
|
||||
this._text = this.items.reduce((all, item) => all + item.data['str'], '');
|
||||
}
|
||||
return this._text;
|
||||
}
|
||||
|
||||
textWithoutNumbers() {
|
||||
textWithoutNumbers(): string {
|
||||
if (!this._textWithoutNumbers) {
|
||||
this._textWithoutNumbers = filterOutDigits(this.text());
|
||||
}
|
||||
|
@ -1,6 +1,15 @@
|
||||
import Item from '../Item';
|
||||
import PageViewport from '../parse/PageViewport';
|
||||
import EvaluationTracker from './EvaluationTracker';
|
||||
|
||||
export default interface TransformContext {
|
||||
fontMap: Map<string, object>;
|
||||
pageViewports: PageViewport[];
|
||||
export default class TransformContext {
|
||||
constructor(
|
||||
public fontMap: Map<string, object>,
|
||||
public pageViewports: PageViewport[],
|
||||
private evaluations = new EvaluationTracker(),
|
||||
) {}
|
||||
|
||||
trackEvaluation(item: Item) {
|
||||
this.evaluations.trackEvaluation(item);
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +1,5 @@
|
||||
<script>
|
||||
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
|
||||
import type ChangeIndex from '@core/debug/ChangeIndex';
|
||||
import type Item from '@core/Item';
|
||||
import { Addition, Removal, ContentChange, PositionChange, Direction } from '../../../core/src/debug/ChangeIndex';
|
||||
@ -10,17 +11,20 @@
|
||||
MinusCircle as Minus,
|
||||
ArrowCircleUp as Up,
|
||||
ArrowCircleDown as Down,
|
||||
Eye,
|
||||
} from 'svelte-hero-icons';
|
||||
|
||||
export let evaluations: EvaluationIndex;
|
||||
export let changes: ChangeIndex;
|
||||
export let item: Item;
|
||||
|
||||
$: evaluated = evaluations.evaluated(item);
|
||||
$: hasChanged = changes.hasChanged(item);
|
||||
let changeContent: string;
|
||||
let iconComp: ComponentDefinition;
|
||||
$: {
|
||||
let args = { size: '14' };
|
||||
if (hasChanged) {
|
||||
let args = { size: '14' };
|
||||
let change = changes.change(item);
|
||||
switch (change.constructor.name) {
|
||||
case PositionChange.name:
|
||||
@ -43,11 +47,13 @@
|
||||
default:
|
||||
throw new Error(`${change.constructor.name}: ${change}`);
|
||||
}
|
||||
} else if (evaluated) {
|
||||
iconComp = new ComponentDefinition(Eye, args);
|
||||
}
|
||||
}
|
||||
</script>
|
||||
|
||||
{#if hasChanged}
|
||||
{#if evaluated || hasChanged}
|
||||
<div class="flex space-x-0.5 items-center text-xs">
|
||||
{#if iconComp}
|
||||
<svelte:component this={iconComp.component} {...iconComp.args} />
|
||||
|
@ -49,7 +49,12 @@
|
||||
|
||||
<!-- Items -->
|
||||
{#if visiblePages.find((page) => page.itemGroups.length > 0)}
|
||||
<ItemTable schema={stageResult.schema} pages={visiblePages} {pageControl} changes={stageResult.changes} />
|
||||
<ItemTable
|
||||
schema={stageResult.schema}
|
||||
pages={visiblePages}
|
||||
{pageControl}
|
||||
evaluations={stageResult.evaluations}
|
||||
changes={stageResult.changes} />
|
||||
{:else}
|
||||
<!-- No items visible -->
|
||||
<div class="flex mt-8">
|
||||
|
@ -4,6 +4,7 @@
|
||||
import { ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
||||
|
||||
import type ItemGroup from '@core/debug/ItemGroup';
|
||||
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
|
||||
import type ChangeIndex from '@core/debug/ChangeIndex';
|
||||
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
||||
|
||||
@ -17,6 +18,7 @@
|
||||
export let itemIdx: number;
|
||||
export let schema: AnnotatedColumn[];
|
||||
export let itemGroup: ItemGroup;
|
||||
export let evaluations: EvaluationIndex;
|
||||
export let changes: ChangeIndex;
|
||||
|
||||
let expandedItemGroup: { pageIndex: number; itemIndex: number };
|
||||
@ -45,7 +47,7 @@
|
||||
<td id="page" />
|
||||
{/if}
|
||||
<td class="align-middle">
|
||||
<ChangeSymbol {changes} item={itemGroup.top} />
|
||||
<ChangeSymbol {evaluations} {changes} item={itemGroup.top} />
|
||||
</td>
|
||||
<span class="contents" on:click={() => itemGroup.hasMany() && toggleRow(pageIdx, itemIdx)}>
|
||||
<!-- ID & change marker column -->
|
||||
@ -72,7 +74,7 @@
|
||||
class:changeMinus={changes.isMinusChange(child)}>
|
||||
<td id="page" />
|
||||
<td class="align-baseline">
|
||||
<ChangeSymbol {changes} item={child} />
|
||||
<ChangeSymbol {evaluations} {changes} item={child} />
|
||||
</td>
|
||||
<td class="whitespace-nowrap">
|
||||
<div class="flex space-x-1">
|
||||
|
@ -4,6 +4,7 @@
|
||||
import { PresentationChartLine } from 'svelte-hero-icons';
|
||||
|
||||
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
||||
import type EvaluationIndex from '@core/transformer/EvaluationIndex';
|
||||
import type ChangeIndex from '@core/debug/ChangeIndex';
|
||||
import type Page from '@core/debug/Page';
|
||||
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
|
||||
@ -15,6 +16,7 @@
|
||||
export let schema: AnnotatedColumn[];
|
||||
export let pages: Page[];
|
||||
export let pageControl: PageControl;
|
||||
export let evaluations: EvaluationIndex;
|
||||
export let changes: ChangeIndex;
|
||||
|
||||
let { pagePinned } = pageControl;
|
||||
@ -80,7 +82,7 @@
|
||||
|
||||
<!-- Page items -->
|
||||
{#each page.itemGroups as itemGroup, itemIdx}
|
||||
<ItemRow pageIdx={page.index} {itemIdx} {schema} {itemGroup} {changes} {pageControl} />
|
||||
<ItemRow pageIdx={page.index} {itemIdx} {schema} {itemGroup} {evaluations} {changes} {pageControl} />
|
||||
{/each}
|
||||
{/each}
|
||||
</tbody>
|
||||
|
Loading…
Reference in New Issue
Block a user