mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-24 11:33:49 +02:00
Globals propagation infrastructure
This commit is contained in:
parent
1a6bddf460
commit
202da9b005
@ -9,6 +9,7 @@ import { asPages } from './debug/Page';
|
||||
import EvaluationTracker from './transformer/EvaluationTracker';
|
||||
import ChangeTracker from './debug/ChangeTracker';
|
||||
import PageViewport from './parse/PageViewport';
|
||||
import Globals from './transformer/Globals';
|
||||
|
||||
export default class Debugger {
|
||||
private transformers: ItemTransformer[];
|
||||
@ -34,13 +35,19 @@ export default class Debugger {
|
||||
for (let idx = 0; idx < stageIndex + 1; idx++) {
|
||||
if (!this.stageResultCache[idx]) {
|
||||
const evaluations = new EvaluationTracker();
|
||||
const context = new TransformContext(this.fontMap, this.pageViewports, evaluations);
|
||||
const transformer = this.transformers[idx - 1];
|
||||
const previousStageResult: StageResult = this.stageResultCache[idx - 1];
|
||||
const context = new TransformContext(
|
||||
this.fontMap,
|
||||
this.pageViewports,
|
||||
previousStageResult.globals,
|
||||
evaluations,
|
||||
);
|
||||
const previousItems = previousStageResult.itemsCleanedAndUnpacked();
|
||||
const inputSchema = toSimpleSchema(previousStageResult);
|
||||
const outputSchema = transformer.schemaTransformer(inputSchema);
|
||||
const itemResult = transformer.transform(context, [...previousItems]);
|
||||
const globals = new Globals(previousStageResult.globals).withValues(itemResult.globals);
|
||||
|
||||
const changes = new ChangeTracker();
|
||||
const items = detectChanges(changes, previousItems, itemResult.items);
|
||||
@ -52,6 +59,7 @@ export default class Debugger {
|
||||
|
||||
this.stageResultCache.push(
|
||||
new StageResult(
|
||||
globals,
|
||||
transformer.descriptor,
|
||||
toAnnotatedSchema(inputSchema, outputSchema),
|
||||
pages,
|
||||
|
@ -1,7 +1,8 @@
|
||||
import type Item from './Item';
|
||||
import GlobalValue from './transformer/GlobalValue';
|
||||
|
||||
export default interface ItemResult {
|
||||
items: Item[];
|
||||
messages: string[];
|
||||
globals?: object;
|
||||
globals?: GlobalValue<any>[];
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import ParseResult from './ParseResult';
|
||||
import Debugger from './Debugger';
|
||||
import { assert } from './assert';
|
||||
import TransformContext from './transformer/TransformContext';
|
||||
import Globals from './transformer/Globals';
|
||||
|
||||
export default class PdfPipeline {
|
||||
parser: PdfParser;
|
||||
@ -29,9 +30,12 @@ export default class PdfPipeline {
|
||||
const parseResult = await this.parse(src, progressListener);
|
||||
this.verifyRequiredColumns(parseResult.schema, this.transformers);
|
||||
let items = parseResult.items;
|
||||
let globals = new Globals();
|
||||
const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports, globals);
|
||||
this.transformers.forEach((transformer) => {
|
||||
const context = new TransformContext(parseResult.fontMap, parseResult.pageViewports);
|
||||
items = transformer.transform(context, items).items;
|
||||
const result = transformer.transform(context, items);
|
||||
globals = globals.withValues(result.globals);
|
||||
items = result.items;
|
||||
});
|
||||
parseResult.items = items;
|
||||
return parseResult;
|
||||
|
@ -7,9 +7,11 @@ import ChangeTracker from './ChangeTracker';
|
||||
import ItemGroup from './ItemGroup';
|
||||
import EvaluationIndex from '../transformer/EvaluationIndex';
|
||||
import EvaluationTracker from '../transformer/EvaluationTracker';
|
||||
import Globals from '../transformer/Globals';
|
||||
|
||||
export default class StageResult {
|
||||
constructor(
|
||||
public globals: Globals,
|
||||
public descriptor: TransformDescriptor,
|
||||
public schema: AnnotatedColumn[],
|
||||
public pages: Page[],
|
||||
@ -86,5 +88,13 @@ export function initialStage(inputSchema: string[], inputItems: Item[]): StageRe
|
||||
inputItems.length
|
||||
} items`,
|
||||
];
|
||||
return new StageResult(toDescriptor({ debug: { showAll: true } }), schema, pages, evaluations, changes, messages);
|
||||
return new StageResult(
|
||||
new Globals(),
|
||||
toDescriptor({ debug: { showAll: true } }),
|
||||
schema,
|
||||
pages,
|
||||
evaluations,
|
||||
changes,
|
||||
messages,
|
||||
);
|
||||
}
|
||||
|
@ -3,6 +3,9 @@ import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import FontType from '../FontType';
|
||||
import GlobalDefinition from './GlobalDefinition';
|
||||
|
||||
export const MAX_HEIGHT = new GlobalDefinition<number>('maxHeight');
|
||||
|
||||
export default class CalculateStatistics extends ItemTransformer {
|
||||
constructor() {
|
||||
@ -82,14 +85,14 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
|
||||
return {
|
||||
items: items,
|
||||
globals: {
|
||||
mostUsedHeight: mostUsedHeight,
|
||||
mostUsedFont: mostUsedFont,
|
||||
mostUsedDistance: mostUsedDistance,
|
||||
maxHeight: maxHeight,
|
||||
maxHeightFont: maxHeightFont,
|
||||
fontToFormats: fontToType,
|
||||
},
|
||||
globals: [MAX_HEIGHT.value(maxHeight)],
|
||||
// globals2: {
|
||||
// mostUsedHeight: mostUsedHeight,
|
||||
// mostUsedFont: mostUsedFont,
|
||||
// mostUsedDistance: mostUsedDistance,
|
||||
// maxHeightFont: maxHeightFont,
|
||||
// fontToFormats: fontToType,
|
||||
// },
|
||||
messages: [
|
||||
'Items per height: ' + JSON.stringify(heightToOccurrence),
|
||||
'Items per font: ' + JSON.stringify(fontToOccurrence),
|
||||
|
14
core/src/transformer/GlobalDefinition.ts
Normal file
14
core/src/transformer/GlobalDefinition.ts
Normal file
@ -0,0 +1,14 @@
|
||||
import { assertDefined, assertNot } from 'src/assert';
|
||||
import GlobalValue from './GlobalValue';
|
||||
|
||||
/**
 * Typed handle for a single global, identified by a string key.
 *
 * The type parameter `T` is the type of the value that may be wrapped through
 * this definition, so producers of a global are checked at compile time.
 */
export default class GlobalDefinition<T> {
  public key: string;

  /**
   * @param key identifier stored on this definition
   */
  constructor(key: string) {
    this.key = key;
  }

  /**
   * Wraps `value` in a GlobalValue bound to this definition, without passing
   * the override flag (the GlobalValue constructor default applies).
   *
   * @param value the value to wrap
   * @returns the newly created GlobalValue
   */
  value(value: T) {
    const wrapped = new GlobalValue(this, value);
    return wrapped;
  }

  /**
   * Wraps `value` in a GlobalValue bound to this definition with the override
   * flag set to `true`.
   *
   * @param value the value to wrap
   * @returns the newly created GlobalValue
   */
  overrideValue(value: T) {
    const wrapped = new GlobalValue(this, value, true);
    return wrapped;
  }
}
|
6
core/src/transformer/GlobalValue.ts
Normal file
6
core/src/transformer/GlobalValue.ts
Normal file
@ -0,0 +1,6 @@
|
||||
import { assertDefined, assertNot } from 'src/assert';
|
||||
import GlobalDefinition from './GlobalDefinition';
|
||||
|
||||
/**
 * A concrete value paired with the definition it belongs to.
 *
 * Plain data holder: all three constructor arguments are exposed as public
 * properties of the same name.
 */
export default class GlobalValue<T> {
  /**
   * @param definition the definition this value is bound to
   * @param value the carried value
   * @param override flag stored alongside the value; `false` when omitted
   */
  constructor(
    public definition: GlobalDefinition<T>,
    public value: T,
    public override = false,
  ) {}
}
|
47
core/src/transformer/Globals.ts
Normal file
47
core/src/transformer/Globals.ts
Normal file
@ -0,0 +1,47 @@
|
||||
import GlobalDefinition from './GlobalDefinition';
|
||||
import { assertDefined, assertNot } from '../assert';
|
||||
import GlobalValue from './GlobalValue';
|
||||
|
||||
/**
 * Registry of global values, addressed through typed definitions.
 *
 * Entries are stored in `map` under `definition.key`. An entry whose stored
 * value is `undefined` is reported as not defined by `isDefined`.
 */
export default class Globals {
  map: Map<string, any>;

  /**
   * @param globals optional registry whose entries are copied into the new
   *   instance; the new instance gets its own map
   */
  constructor(globals?: Globals) {
    this.map = new Map(globals?.map);
  }

  /** @returns the registered keys, in map iteration order */
  keys(): string[] {
    return Array.from(this.map.keys());
  }

  /** @returns `true` when a non-`undefined` value is stored under the definition's key */
  isDefined<T>(definition: GlobalDefinition<T>): boolean {
    const stored = this.map.get(definition.key);
    return stored !== undefined;
  }

  /**
   * Looks up the value stored for `definition`.
   *
   * The lookup result is passed to `assertDefined` together with a message
   * listing the currently registered keys.
   */
  get<T>(definition: GlobalDefinition<T>): T {
    const { key } = definition;
    const found = this.map.get(key) as T;
    const registered = this.keys().join(',');
    assertDefined(found, `No global with key '${key}' registered. Only [${registered}]`);
    return found;
  }

  /**
   * Stores `value` under the definition's key. `assertNot` is invoked with the
   * result of `isDefined` first, so an existing entry is reported through it.
   */
  set<T>(definition: GlobalDefinition<T>, value: T): void {
    const alreadyPresent = this.isDefined(definition);
    assertNot(alreadyPresent, `Global with key '${definition.key}' already registered.`);
    this.map.set(definition.key, value);
  }

  /** Stores `value` under the definition's key without checking for an existing entry. */
  override<T>(definition: GlobalDefinition<T>, value: T): void {
    this.map.set(definition.key, value);
  }

  /**
   * Applies each given value to THIS instance (no copy is made): values flagged
   * with `override` go through `override`, all others through `set`.
   *
   * @param values values to apply; nothing happens when absent
   * @returns this same instance
   */
  withValues(values: GlobalValue<any>[] | undefined): Globals {
    if (values == null) {
      return this;
    }
    values.forEach(({ definition, value, override }) => {
      if (!override) {
        this.set(definition, value);
        return;
      }
      this.override(definition, value);
    });
    return this;
  }
}
|
@ -1,6 +1,8 @@
|
||||
import Item from '../Item';
|
||||
import PageViewport from '../parse/PageViewport';
|
||||
import EvaluationTracker from './EvaluationTracker';
|
||||
import GlobalDefinition from './GlobalDefinition';
|
||||
import Globals from './Globals';
|
||||
|
||||
export default class TransformContext {
|
||||
pageCount: number;
|
||||
@ -8,6 +10,7 @@ export default class TransformContext {
|
||||
constructor(
|
||||
public fontMap: Map<string, object>,
|
||||
public pageViewports: PageViewport[],
|
||||
private globals: Globals,
|
||||
private evaluations = new EvaluationTracker(),
|
||||
) {
|
||||
this.pageCount = pageViewports.length;
|
||||
@ -16,4 +19,12 @@ export default class TransformContext {
|
||||
trackEvaluation(item: Item) {
|
||||
this.evaluations.trackEvaluation(item);
|
||||
}
|
||||
|
||||
globalIsDefined<T>(definition: GlobalDefinition<T>): boolean {
|
||||
return this.globals.isDefined(definition);
|
||||
}
|
||||
|
||||
getGlobal<T>(definition: GlobalDefinition<T>): T {
|
||||
return this.globals.get(definition);
|
||||
}
|
||||
}
|
||||
|
@ -6,6 +6,7 @@ import AnnotatedColumn from 'src/debug/AnnotatedColumn';
|
||||
import Page, { asPages } from 'src/debug/Page';
|
||||
import { items } from '../testItems';
|
||||
import LineItemMerger from 'src/debug/LineItemMerger';
|
||||
import Globals from 'src/transformer/Globals';
|
||||
|
||||
test('itemsUnpacked', async () => {
|
||||
const evaluationTracker = new EvaluationTracker();
|
||||
@ -26,7 +27,7 @@ test('itemsUnpacked', async () => {
|
||||
]),
|
||||
];
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems, itemMerger);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
expect(result.itemsUnpacked().map((item) => item.data['idx'])).toEqual([0, 1, 2, 3, 4, 5]);
|
||||
expect(result.itemsCleanedAndUnpacked().map((item) => item.data['idx'])).toEqual([0, 1, 2, 3, 4, 5]);
|
||||
@ -53,7 +54,7 @@ test('itemsCleanedAndUnpacked', async () => {
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems, itemMerger);
|
||||
changeTracker.trackRemoval(flatItems[1]);
|
||||
changeTracker.trackRemoval(flatItems[4]);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
expect(result.itemsUnpacked().map((item) => item.data['idx'])).toEqual([0, 1, 2, 3, 4, 5]);
|
||||
expect(result.itemsCleanedAndUnpacked().map((item) => item.data['idx'])).toEqual([0, 2, 3, 5]);
|
||||
@ -84,7 +85,7 @@ describe('select pages', () => {
|
||||
changeTracker.trackAddition(flatItems[2]);
|
||||
changeTracker.trackAddition(flatItems[4]);
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems, itemMerger);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
const allGrouped = result.selectPages(false, true);
|
||||
expect(allGrouped.map((page) => page.index)).toEqual([0]);
|
||||
@ -122,7 +123,7 @@ describe('select pages', () => {
|
||||
]),
|
||||
];
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems, itemMerger);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
const allGrouped = result.selectPages(false, true);
|
||||
expect(allGrouped.map((page) => page.index)).toEqual([0, 1, 2]);
|
||||
@ -171,7 +172,7 @@ describe('select pages', () => {
|
||||
changeTracker.trackAddition(flatItems[3]);
|
||||
changeTracker.trackAddition(flatItems[5]);
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems, itemMerger);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
const allGrouped = result.selectPages(false, true);
|
||||
expect(allGrouped.map((page) => page.index)).toEqual([0, 1, 2]);
|
||||
@ -217,7 +218,7 @@ describe('select pages', () => {
|
||||
]),
|
||||
];
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems, itemMerger);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
const relevantGrouped = result.selectPages(true, true);
|
||||
expect(relevantGrouped.map((page) => page.index)).toEqual([0, 1, 2]);
|
||||
@ -237,7 +238,7 @@ describe('select pages', () => {
|
||||
...items(2, [{ idx: 4 }, { idx: 5 }]),
|
||||
];
|
||||
const pages = asPages(evaluationTracker, changeTracker, flatItems);
|
||||
const result = new StageResult(descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
const result = new StageResult(new Globals(), descriptor, schema, pages, evaluationTracker, changeTracker, []);
|
||||
|
||||
const relevantGrouped = result.selectPages(true, true);
|
||||
expect(relevantGrouped.map((page) => page.index)).toEqual([0, 1, 2]);
|
||||
|
51
core/test/transformer/Globals.test.ts
Normal file
51
core/test/transformer/Globals.test.ts
Normal file
@ -0,0 +1,51 @@
|
||||
import GlobalDefinition from 'src/transformer/GlobalDefinition';
|
||||
import Globals from 'src/transformer/Globals';
|
||||
|
||||
// Two definitions with distinct keys and value types, shared by every test below.
const MyGlobalString = new GlobalDefinition<string>('myGlobalString');
const MyGlobalNumber = new GlobalDefinition<number>('myGlobalNumber');

// Reading a key that was never registered fails and lists the keys that are.
test('not set', async () => {
  const registry = new Globals();
  registry.set(MyGlobalString, '23');

  const expectedMessage = `No global with key '${MyGlobalNumber.key}' registered. Only [${MyGlobalString.key}]`;
  expect(registry.isDefined(MyGlobalNumber)).toBeFalsy();
  expect(() => registry.get(MyGlobalNumber)).toThrow(expectedMessage);
});

// A registered value is reported as defined, readable, and listed in keys().
test('set', async () => {
  const registry = new Globals();
  registry.set(MyGlobalNumber, 24);

  expect(registry.isDefined(MyGlobalNumber)).toBeTruthy();
  expect(registry.get(MyGlobalNumber)).toEqual(24);
  expect(registry.keys()).toEqual([MyGlobalNumber.key]);
});

// set() refuses to replace an existing entry.
test('set, already exists', async () => {
  const registry = new Globals();
  registry.set(MyGlobalNumber, 24);

  const setAgain = () => registry.set(MyGlobalNumber, 25);
  expect(setAgain).toThrow("Global with key 'myGlobalNumber' already registered.");
});

// override() replaces an existing entry.
test('override', async () => {
  const registry = new Globals();
  registry.set(MyGlobalNumber, 24);
  registry.override(MyGlobalNumber, 25);

  expect(registry.isDefined(MyGlobalNumber)).toBeTruthy();
  expect(registry.get(MyGlobalNumber)).toEqual(25);
});

// A registry constructed from another one starts with the other's entries.
test('inheritence', async () => {
  const parent = new Globals();
  parent.set(MyGlobalNumber, 24);
  const child = new Globals(parent);
  child.set(MyGlobalString, 'myKey');

  expect(child.keys()).toEqual([MyGlobalNumber.key, MyGlobalString.key]);
  expect(child.isDefined(MyGlobalNumber)).toBeTruthy();
  expect(child.isDefined(MyGlobalString)).toBeTruthy();
  expect(child.get(MyGlobalNumber)).toEqual(24);
  expect(child.get(MyGlobalString)).toEqual('myKey');
});
|
Loading…
x
Reference in New Issue
Block a user