mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-17 02:58:30 +01:00
Improve change detection to handle removal case properly
This commit is contained in:
parent
8e024ee544
commit
77b7d837eb
@ -38,14 +38,14 @@ export default class Debugger {
|
||||
if (!this.stageResultCache[idx]) {
|
||||
const transformer = this.transformers[idx - 1];
|
||||
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
||||
const previousItems = previousStageResult.itemsUnpacked();
|
||||
const previousItems = previousStageResult.itemsCleanedAndUnpacked();
|
||||
const inputSchema = toSimpleSchema(previousStageResult);
|
||||
const outputSchema = transformer.schemaTransformer(inputSchema);
|
||||
const itemResult = transformer.transform(this.context, [...previousItems]);
|
||||
|
||||
const changeTracker = new ChangeTracker();
|
||||
detectChanges(changeTracker, previousItems, itemResult.items);
|
||||
const pages = asPages(changeTracker, itemResult.items, transformer.descriptor.debug?.itemMerger);
|
||||
const items = detectChanges(changeTracker, previousItems, itemResult.items);
|
||||
const pages = asPages(changeTracker, items, transformer.descriptor.debug?.itemMerger);
|
||||
const messages = itemResult.messages;
|
||||
if (changeTracker.changeCount() > 0) {
|
||||
messages.unshift(`Detected ${changeTracker.changeCount()} changes`);
|
||||
|
@ -35,6 +35,12 @@ export default interface ChangeIndex {
|
||||
* @param item
|
||||
*/
|
||||
isMinusChange(item: Item): boolean;
|
||||
|
||||
/**
|
||||
* Returns true if the item was removed.
|
||||
* @param item
|
||||
*/
|
||||
isRemoved(item: Item): boolean;
|
||||
}
|
||||
|
||||
export abstract class Change {
|
||||
|
@ -19,7 +19,12 @@ export default class ChangeTracker implements ChangeIndex {
|
||||
|
||||
private addChange(item: Item, change: Change) {
|
||||
const uuid = _uuid(item);
|
||||
assertNot(this.changes.has(uuid), `Change for item ${uuid} already defined`);
|
||||
assertNot(
|
||||
this.changes.has(uuid),
|
||||
`Change for item ${uuid} already defined! (old: ${JSON.stringify(this.changes.get(uuid))}, new: ${JSON.stringify(
|
||||
change,
|
||||
)})`,
|
||||
);
|
||||
this.changes.set(uuid, change);
|
||||
}
|
||||
|
||||
@ -38,7 +43,9 @@ export default class ChangeTracker implements ChangeIndex {
|
||||
trackPositionalChange(item: Item, oldPosition: number, newPosition: number) {
|
||||
const direction = newPosition > oldPosition ? Direction.DOWN : Direction.UP;
|
||||
const amount = Math.abs(newPosition - oldPosition);
|
||||
this.addChange(item, new PositionChange(direction, amount));
|
||||
if (amount > 0) {
|
||||
this.addChange(item, new PositionChange(direction, amount));
|
||||
}
|
||||
}
|
||||
|
||||
trackContentChange(item: Item) {
|
||||
@ -64,6 +71,10 @@ export default class ChangeTracker implements ChangeIndex {
|
||||
isMinusChange(item: Item): boolean {
|
||||
return this.change(item)?.category === ChangeCategory.MINUS;
|
||||
}
|
||||
|
||||
isRemoved(item: Item): boolean {
|
||||
return this.change(item)?.constructor.name === REMOVAL.constructor.name;
|
||||
}
|
||||
}
|
||||
|
||||
function _uuid(item: Item): string {
|
||||
|
@ -30,6 +30,8 @@ export default class LineItemMerger extends ItemMerger {
|
||||
|
||||
if (this.trackAsNew) {
|
||||
tracker.trackAddition(newItem);
|
||||
} else if (items.every((item) => tracker.isRemoved(item))) {
|
||||
tracker.trackRemoval(newItem);
|
||||
} else if (items.find((item) => tracker.hasChanged(item))) {
|
||||
tracker.trackContentChange(newItem);
|
||||
}
|
||||
|
@ -22,6 +22,18 @@ export default class StageResult {
|
||||
}, []);
|
||||
}
|
||||
|
||||
itemsCleanedAndUnpacked(): Item[] {
|
||||
return this.pages.reduce((items: Item[], page: Page) => {
|
||||
page.itemGroups.forEach((itemGroup) =>
|
||||
itemGroup
|
||||
.unpacked()
|
||||
.filter((item) => !this.changes.isRemoved(item))
|
||||
.forEach((item) => items.push(item)),
|
||||
);
|
||||
return items;
|
||||
}, []);
|
||||
}
|
||||
|
||||
selectPages(relevantChangesOnly: boolean, groupItems: boolean): Page[] {
|
||||
let result: Page[];
|
||||
|
||||
|
@ -1,38 +1,80 @@
|
||||
import ChangeTracker from './ChangeTracker';
|
||||
import { assertDefined } from '../assert';
|
||||
import Item from '../Item';
|
||||
import { groupByPage } from '../support/groupingUtils';
|
||||
|
||||
/**
|
||||
* Compares incomming and outgoing items of a transformer in order to detect changes and to display them in any debug visualization.
|
||||
* Note: ItemMerger registers changes as well.
|
||||
*/
|
||||
export function detectChanges(tracker: ChangeTracker, inputItems: Item[], transformedItems: Item[]) {
|
||||
const inputItemsByUuid = inputMap(inputItems);
|
||||
transformedItems.forEach((item, idx) => {
|
||||
const uuid = _uuid(item);
|
||||
const oldItem = inputItemsByUuid.get(uuid);
|
||||
if (oldItem) {
|
||||
if (idx !== oldItem.position) {
|
||||
tracker.trackPositionalChange(item, oldItem.position, idx);
|
||||
}
|
||||
//TODO check for change
|
||||
} else {
|
||||
tracker.trackAddition(item);
|
||||
}
|
||||
});
|
||||
|
||||
//TODO detect removals (need to re-add them ?)
|
||||
}
|
||||
|
||||
interface InputItem {
|
||||
item: Item;
|
||||
position: number;
|
||||
}
|
||||
|
||||
function inputMap(inputItems: Item[]): Map<string, InputItem> {
|
||||
return inputItems.reduce((map, item, idx) => {
|
||||
map.set(_uuid(item), { item, position: idx });
|
||||
export function detectChanges(tracker: ChangeTracker, inputItems: Item[], outputItems: Item[]): Item[] {
|
||||
const oututItemsByPage = groupByPage(outputItems).reduce((map: Map<number, Item[]>, pageItems) => {
|
||||
map.set(pageItems[0].page, pageItems);
|
||||
return map;
|
||||
}, new Map());
|
||||
|
||||
const mergedItems: Item[] = [];
|
||||
groupByPage(inputItems).forEach((inputPageItems) => {
|
||||
const page = inputPageItems[0].page;
|
||||
const outputPageItems = oututItemsByPage.get(page) || [];
|
||||
mergedItems.push(...detectPageChanges(tracker, inputPageItems, outputPageItems));
|
||||
});
|
||||
|
||||
return mergedItems;
|
||||
}
|
||||
|
||||
function detectPageChanges(tracker: ChangeTracker, inputItems: Item[], outputItems: Item[]): Item[] {
|
||||
const mergedItems: Item[] = [];
|
||||
const addedItems: Set<string> = new Set();
|
||||
let removals = 0;
|
||||
let additions = 0;
|
||||
let outputIndex = 0;
|
||||
for (let inputIdx = 0; inputIdx < inputItems.length; inputIdx++) {
|
||||
const inputItem = inputItems[inputIdx];
|
||||
if (addedItems.has(_uuid(inputItem))) {
|
||||
continue;
|
||||
}
|
||||
const positionInOutput = outputItems.findIndex((item) => item.uuid === inputItem.uuid);
|
||||
if (positionInOutput < 0) {
|
||||
tracker.trackRemoval(inputItem);
|
||||
mergedItems.push(inputItem);
|
||||
addedItems.add(_uuid(inputItem));
|
||||
removals++;
|
||||
} else if (positionInOutput === inputIdx + additions - removals) {
|
||||
mergedItems.push(outputItems[positionInOutput]);
|
||||
addedItems.add(_uuid(outputItems[positionInOutput]));
|
||||
outputIndex++;
|
||||
//TODO check for content change ?
|
||||
} else {
|
||||
for (let intermediateOutputIdx = outputIndex; intermediateOutputIdx < positionInOutput; intermediateOutputIdx++) {
|
||||
const outputItem = outputItems[intermediateOutputIdx];
|
||||
const positionInInput = inputItems.findIndex((item) => item.uuid === outputItem.uuid);
|
||||
if (positionInInput < 0) {
|
||||
tracker.trackAddition(outputItem);
|
||||
mergedItems.push(outputItem);
|
||||
addedItems.add(_uuid(outputItem));
|
||||
additions++;
|
||||
outputIndex++;
|
||||
} else {
|
||||
tracker.trackPositionalChange(outputItem, positionInInput - removals, intermediateOutputIdx - additions);
|
||||
mergedItems.push(outputItem);
|
||||
addedItems.add(_uuid(outputItem));
|
||||
outputIndex++;
|
||||
}
|
||||
}
|
||||
tracker.trackPositionalChange(outputItems[positionInOutput], inputIdx - removals, positionInOutput - additions);
|
||||
mergedItems.push(outputItems[positionInOutput]);
|
||||
addedItems.add(_uuid(outputItems[positionInOutput]));
|
||||
outputIndex++;
|
||||
//TODO check for content change ?
|
||||
}
|
||||
}
|
||||
for (let remaingOutputIndex = outputIndex; remaingOutputIndex < outputItems.length; remaingOutputIndex++) {
|
||||
tracker.trackAddition(outputItems[remaingOutputIndex]);
|
||||
mergedItems.push(outputItems[remaingOutputIndex]);
|
||||
additions++;
|
||||
}
|
||||
return mergedItems;
|
||||
}
|
||||
|
||||
function _uuid(item: Item): string {
|
||||
|
@ -27,6 +27,33 @@ test('itemsUnpacked', async () => {
|
||||
const result = new StageResult(descriptor, schema, pages, tracker, []);
|
||||
|
||||
expect(result.itemsUnpacked().map((item) => item.data['idx'])).toEqual([0, 1, 2, 3, 4, 5]);
|
||||
expect(result.itemsCleanedAndUnpacked().map((item) => item.data['idx'])).toEqual([0, 1, 2, 3, 4, 5]);
|
||||
});
|
||||
|
||||
test('itemsCleanedAndUnpacked', async () => {
|
||||
const tracker = new ChangeTracker();
|
||||
const itemMerger = new LineItemMerger(false);
|
||||
const descriptor = toDescriptor({ debug: { itemMerger } });
|
||||
const schema: AnnotatedColumn[] = [{ name: 'A' }];
|
||||
const flatItems = [
|
||||
...items(0, [
|
||||
{ idx: 0, line: 1 },
|
||||
{ idx: 1, line: 1 },
|
||||
{ idx: 2, line: 2 },
|
||||
]),
|
||||
...items(1, [{ idx: 3, line: 1 }]),
|
||||
...items(2, [
|
||||
{ idx: 4, line: 1 },
|
||||
{ idx: 5, line: 1 },
|
||||
]),
|
||||
];
|
||||
const pages = asPages(tracker, flatItems, itemMerger);
|
||||
tracker.trackRemoval(flatItems[1]);
|
||||
tracker.trackRemoval(flatItems[4]);
|
||||
const result = new StageResult(descriptor, schema, pages, tracker, []);
|
||||
|
||||
expect(result.itemsUnpacked().map((item) => item.data['idx'])).toEqual([0, 1, 2, 3, 4, 5]);
|
||||
expect(result.itemsCleanedAndUnpacked().map((item) => item.data['idx'])).toEqual([0, 2, 3, 5]);
|
||||
});
|
||||
|
||||
describe('select pages', () => {
|
||||
|
@ -1,33 +1,101 @@
|
||||
import ChangeTracker from 'src/debug/ChangeTracker';
|
||||
import { detectChanges } from 'src/debug/detectChanges';
|
||||
import { PositionChange, Direction } from 'src/debug/ChangeIndex';
|
||||
import { items } from 'test/testItems';
|
||||
import { PositionChange, Direction, Addition, Removal } from 'src/debug/ChangeIndex';
|
||||
import { idItem, idItems, items } from 'test/testItems';
|
||||
import Item from 'src/Item';
|
||||
|
||||
test('No changes', async () => {
|
||||
const inputItems = items(0, [
|
||||
{ id: 'A', line: '1', x: 1 },
|
||||
{ id: 'D', line: '1', x: 4 },
|
||||
{ id: 'C', line: '1', x: 3 },
|
||||
{ id: 'B', line: '1', x: 2 },
|
||||
]);
|
||||
const items = idItems(0, ['A', 'B', 'C']);
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
detectChanges(tracker, inputItems, inputItems);
|
||||
const debugItems = detectChanges(tracker, items, items);
|
||||
expect(tracker.changeCount()).toEqual(0);
|
||||
expect(debugItems).toEqual(items);
|
||||
});
|
||||
test('Positional changes', async () => {
|
||||
const inputItems = items(0, [
|
||||
{ id: 'A', line: '1', x: 1 },
|
||||
{ id: 'D', line: '1', x: 4 },
|
||||
{ id: 'C', line: '1', x: 3 },
|
||||
{ id: 'B', line: '1', x: 2 },
|
||||
]);
|
||||
|
||||
const transformedItems = [inputItems[0], inputItems[3], inputItems[2], inputItems[1]];
|
||||
test('Positional changes', async () => {
|
||||
const itemsIn = idItems(0, ['A', 'D', 'C', 'B']);
|
||||
const itemsOut = [itemsIn[0], itemsIn[3], itemsIn[2], itemsIn[1]];
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
const changes = detectChanges(tracker, inputItems, transformedItems);
|
||||
const debugItems = detectChanges(tracker, itemsIn, itemsOut);
|
||||
expect(tracker.changeCount()).toEqual(2);
|
||||
expect(tracker.change(inputItems[1])).toEqual(new PositionChange(Direction.DOWN, 2));
|
||||
expect(tracker.change(inputItems[3])).toEqual(new PositionChange(Direction.UP, 2));
|
||||
expect(tracker.change(itemsIn[1])).toEqual(new PositionChange(Direction.DOWN, 2));
|
||||
expect(tracker.change(itemsIn[3])).toEqual(new PositionChange(Direction.UP, 2));
|
||||
expect(tracker.change(itemsOut[1])).toEqual(new PositionChange(Direction.UP, 2));
|
||||
expect(tracker.change(itemsOut[3])).toEqual(new PositionChange(Direction.DOWN, 2));
|
||||
expect(debugItems).toEqual(itemsOut);
|
||||
});
|
||||
|
||||
test('Additions', async () => {
|
||||
const itemsIn = idItems(0, ['A', 'B', 'C']);
|
||||
const itemsOut = [itemsIn[0], idItem(0, 'A2'), itemsIn[1], itemsIn[2], idItem(0, 'C2')];
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
|
||||
const debugItems = detectChanges(tracker, itemsIn, itemsOut);
|
||||
expect(tracker.changeCount()).toEqual(2);
|
||||
expect(tracker.change(itemsOut[1])).toEqual(new Addition());
|
||||
expect(tracker.change(itemsOut[4])).toEqual(new Addition());
|
||||
expect(debugItems).toEqual(itemsOut);
|
||||
});
|
||||
|
||||
test('Removals', async () => {
|
||||
const itemsIn = idItems(0, ['A', 'B', 'C', 'D']);
|
||||
const itemsOut = [itemsIn[1], itemsIn[3]];
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
const debugItems = detectChanges(tracker, itemsIn, itemsOut);
|
||||
expect(tracker.changeCount()).toEqual(2);
|
||||
expect(tracker.change(debugItems[0])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[2])).toEqual(new Removal());
|
||||
expect(debugItems).toEqual(itemsIn);
|
||||
});
|
||||
|
||||
test('Total Replacement', async () => {
|
||||
const itemsIn = idItems(0, ['A', 'B', 'C']);
|
||||
const itemsOut = idItems(0, ['D', 'E', 'F']);
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
const debugItems = detectChanges(tracker, itemsIn, itemsOut);
|
||||
expect(tracker.changeCount()).toEqual(6);
|
||||
expect(tracker.change(debugItems[0])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[1])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[2])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[3])).toEqual(new Addition());
|
||||
expect(tracker.change(debugItems[4])).toEqual(new Addition());
|
||||
expect(tracker.change(debugItems[5])).toEqual(new Addition());
|
||||
expect(debugItems).toEqual([...itemsIn, ...itemsOut]);
|
||||
});
|
||||
|
||||
test('Mixed1', async () => {
|
||||
const itemsIn = idItems(0, ['A', 'C', 'D']);
|
||||
const itemsOut = [idItem(0, 'B'), itemsIn[2], itemsIn[1]];
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
const debugItems = detectChanges(tracker, itemsIn, itemsOut);
|
||||
expect(tracker.changeCount()).toEqual(4);
|
||||
expect(tracker.change(debugItems[0])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[1])).toEqual(new Addition());
|
||||
expect(tracker.change(debugItems[2])).toEqual(new PositionChange(Direction.UP, 1));
|
||||
expect(tracker.change(debugItems[3])).toEqual(new PositionChange(Direction.DOWN, 1));
|
||||
expect(debugItems).toEqual([itemsIn[0], itemsOut[0], itemsIn[2], itemsIn[1]]);
|
||||
});
|
||||
|
||||
test('Mixed2', async () => {
|
||||
const itemsIn = idItems(0, ['A', 'B', 'C', 'D', 'E', 'X', 'Y']);
|
||||
const itemsOut = [itemsIn[0], itemsIn[5], itemsIn[6], idItem(0, 'Z'), itemsIn[3], itemsIn[4]];
|
||||
|
||||
const tracker = new ChangeTracker();
|
||||
const debugItems = detectChanges(tracker, itemsIn, itemsOut);
|
||||
expect(debugItems.map((item) => item.data['id'])).toEqual(['A', 'B', 'C', 'X', 'Y', 'Z', 'D', 'E']);
|
||||
expect(tracker.changeCount()).toEqual(7);
|
||||
expect(tracker.change(debugItems[0])).toBeUndefined();
|
||||
expect(tracker.change(debugItems[1])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[2])).toEqual(new Removal());
|
||||
expect(tracker.change(debugItems[3])).toEqual(new PositionChange(Direction.UP, 2));
|
||||
expect(tracker.change(debugItems[4])).toEqual(new PositionChange(Direction.UP, 2));
|
||||
expect(tracker.change(debugItems[5])).toEqual(new Addition());
|
||||
expect(tracker.change(debugItems[6])).toEqual(new PositionChange(Direction.DOWN, 2));
|
||||
expect(tracker.change(debugItems[7])).toEqual(new PositionChange(Direction.DOWN, 2));
|
||||
});
|
||||
|
@ -4,6 +4,18 @@ export function items(page: number, data: object[]): Item[] {
|
||||
return data.map((data) => new Item(page, data));
|
||||
}
|
||||
|
||||
export function item(page: number, data: object): Item {
|
||||
return new Item(page, data);
|
||||
}
|
||||
|
||||
export function idItems(page: number, ids: string[]): Item[] {
|
||||
return ids.map((id) => new Item(page, { id }));
|
||||
}
|
||||
|
||||
export function idItem(page: number, id: string): Item {
|
||||
return new Item(page, { id });
|
||||
}
|
||||
|
||||
export function realisticItems(page: number, data: object[]): Item[] {
|
||||
return data.map(
|
||||
(data, idx) =>
|
||||
|
Loading…
Reference in New Issue
Block a user