From 0910f7b148ae1a29a253268ec3240b0ea4627615 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sun, 21 Feb 2021 13:23:31 +0100 Subject: [PATCH] Grouping of line items --- core/src/Debugger.ts | 1 + core/src/ItemMerger.ts | 6 + core/src/TransformDescriptor.ts | 24 ++++ core/src/TransformerDescription.ts | 5 - core/src/debug/StageResult.ts | 2 + core/src/support/ItemGroup.ts | 17 +++ core/src/support/Page.ts | 6 + core/src/support/itemUtils.ts | 55 +++++++++ core/src/transformer/AdjustHeight.ts | 2 +- core/src/transformer/CompactLines.ts | 28 ++++- core/src/transformer/ItemTransformer.ts | 15 +-- core/src/transformer/transformerUtils.ts | 19 --- core/test/Debugger.test.ts | 8 +- core/test/PdfPipeline.test.ts | 4 +- core/test/support/itemUtils.test.ts | 112 ++++++++++++++++++ core/test/transformer/CompactLines.test.ts | 51 ++++++++ .../test/transformer/transformerUtils.test.ts | 31 ----- ui/src/debug/DebugView.svelte | 17 +-- ui/src/debug/ItemTable.svelte | 81 +++++++++---- 19 files changed, 378 insertions(+), 106 deletions(-) create mode 100644 core/src/ItemMerger.ts create mode 100644 core/src/TransformDescriptor.ts delete mode 100644 core/src/TransformerDescription.ts create mode 100644 core/src/support/ItemGroup.ts create mode 100644 core/src/support/Page.ts create mode 100644 core/src/support/itemUtils.ts delete mode 100644 core/src/transformer/transformerUtils.ts create mode 100644 core/test/support/itemUtils.test.ts create mode 100644 core/test/transformer/CompactLines.test.ts delete mode 100644 core/test/transformer/transformerUtils.test.ts diff --git a/core/src/Debugger.ts b/core/src/Debugger.ts index a5e1e81..ebf5a02 100644 --- a/core/src/Debugger.ts +++ b/core/src/Debugger.ts @@ -39,6 +39,7 @@ export default class Debugger { const outputSchema = transformer.schemaTransformer(inputSchema); const itemResult = transformer.transform(this.context, [...this.stageResultCache[idx - 1].items]); this.stageResultCache.push({ + descriptor: transformer.descriptor, schema: toAnnotatedSchema(inputSchema, outputSchema), ...itemResult, }); diff --git a/core/src/ItemMerger.ts b/core/src/ItemMerger.ts new file mode 100644 index 0000000..92d656e --- /dev/null +++ b/core/src/ItemMerger.ts @@ -0,0 +1,6 @@ +import type Item from './Item'; + +export default interface ItemMerger { + groupKey: string; + merge(items: Item[]): Item; +} diff --git a/core/src/TransformDescriptor.ts b/core/src/TransformDescriptor.ts new file mode 100644 index 0000000..26ed355 --- /dev/null +++ b/core/src/TransformDescriptor.ts @@ -0,0 +1,24 @@ +import type ItemMerger from './ItemMerger'; + +export default interface TransformDescriptor { + readonly requireColumns: string[]; + readonly consumesGlobels: string[]; + readonly producesGlobels: string[]; + /** + * If this is set, the debug UI will group items and display a merged item. + */ + readonly itemMerger?: ItemMerger; +} + +const defaults: TransformDescriptor = { + requireColumns: [], + consumesGlobels: [], + producesGlobels: [], +}; + +export function toDescriptor(partial: Partial): TransformDescriptor { + return { + ...defaults, + ...partial, + }; +} diff --git a/core/src/TransformerDescription.ts b/core/src/TransformerDescription.ts deleted file mode 100644 index b3b7c97..0000000 --- a/core/src/TransformerDescription.ts +++ /dev/null @@ -1,5 +0,0 @@ -export default interface TransformerDescriptor { - readonly requireColumns?: string[]; - readonly consumesGlobels?: string[]; - readonly producesGlobels?: string[]; -} diff --git a/core/src/debug/StageResult.ts b/core/src/debug/StageResult.ts index 5070799..240b16e 100644 --- a/core/src/debug/StageResult.ts +++ b/core/src/debug/StageResult.ts @@ -1,7 +1,9 @@ +import TransformDescriptor from '../TransformDescriptor'; import Item from '../Item'; import AnnotatedColumn from './AnnotatedColumn'; export default interface StageResult { + descriptor?: TransformDescriptor; schema: AnnotatedColumn[]; items: Item[]; messages: string[]; diff --git a/core/src/support/ItemGroup.ts b/core/src/support/ItemGroup.ts new file mode 100644 index 0000000..0489fed --- /dev/null +++ b/core/src/support/ItemGroup.ts @@ -0,0 +1,17 @@ +import { assertDefined } from '../assert'; +import type Item from '../Item'; +import type ItemMerger from '../ItemMerger'; + +export default class ItemGroup { + top: Item; + elements: Item[]; + + constructor(top: Item, items: Item[] = []) { + this.top = top; + this.elements = items; + } + + hasMany(): boolean { + return this.elements.length > 0; + } +} diff --git a/core/src/support/Page.ts b/core/src/support/Page.ts new file mode 100644 index 0000000..ca04b7a --- /dev/null +++ b/core/src/support/Page.ts @@ -0,0 +1,6 @@ +import type ItemGroup from './ItemGroup'; + +export default interface Page { + index: number; + itemGroups: ItemGroup[]; +} diff --git a/core/src/support/itemUtils.ts b/core/src/support/itemUtils.ts new file mode 100644 index 0000000..b8c7754 --- /dev/null +++ b/core/src/support/itemUtils.ts @@ -0,0 +1,55 @@ +import ItemMerger from 'src/ItemMerger'; +import Item from '../Item'; +import ItemGroup from './ItemGroup'; +import Page from './Page'; + +type PageItemTransformer = (page: number, items: Item[]) => Item[]; + +export function groupByPage(items: Item[]): Item[][] { + return items.reduce((pageItems: Item[][], item: Item) => { + const lastPageItems = pageItems[pageItems.length - 1]; + if (!lastPageItems || item.page > lastPageItems[0]?.page) { + pageItems.push([item]); + } else { + lastPageItems.push(item); + } + return pageItems; + }, []); +} + +export function groupByElement(items: Item[], elementName: string): Item[][] { + return items.reduce((groupedItems: Item[][], item: Item) => { + const lastGroupItems = groupedItems[groupedItems.length - 1]; + if (!lastGroupItems || item.data[elementName] !== lastGroupItems[0]?.data[elementName]) { + groupedItems.push([item]); + } else { + lastGroupItems.push(item); + } + return groupedItems; + }, []); +} + +export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] { + return new Array().concat( + ...groupByPage(items).map((pageItems) => groupedTransformer(pageItems[0].page, pageItems)), + ); +} + +export function asPages(items: Item[], itemMerger?: ItemMerger): Page[] { + return groupByPage(items).map((pageItems: Item[]) => { + let itemGroups: ItemGroup[]; + if (itemMerger) { + itemGroups = groupByElement(pageItems, itemMerger.groupKey).map((groupItems) => { + if (groupItems.length > 1) { + const top = itemMerger.merge(groupItems); + return new ItemGroup(top, groupItems); + } else { + return new ItemGroup(groupItems[0]); + } + }); + } else { + itemGroups = pageItems.map((item) => new ItemGroup(item)); + } + return { index: pageItems[0].page, itemGroups } as Page; + }); +} diff --git a/core/src/transformer/AdjustHeight.ts b/core/src/transformer/AdjustHeight.ts index 1ac53c7..e2cc78c 100644 --- a/core/src/transformer/AdjustHeight.ts +++ b/core/src/transformer/AdjustHeight.ts @@ -3,7 +3,7 @@ import Item from '../Item'; import ItemResult from '../ItemResult'; import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; -import { transformGroupedByPage } from './transformerUtils'; +import { transformGroupedByPage } from '../support/itemUtils'; export default class AdjustHeight extends ItemTransformer { constructor() { diff --git a/core/src/transformer/CompactLines.ts b/core/src/transformer/CompactLines.ts index e6b9a65..23b04b1 100644 --- a/core/src/transformer/CompactLines.ts +++ b/core/src/transformer/CompactLines.ts @@ -2,7 +2,7 @@ import Item from '../Item'; import ItemResult from '../ItemResult'; import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; -import { transformGroupedByPage } from './transformerUtils'; +import { transformGroupedByPage } from '../support/itemUtils'; export default class CompactLines extends ItemTransformer { constructor() { @@ -11,6 +11,10 @@ export default class CompactLines extends ItemTransformer { 'Combines items on the same y-axis', { requireColumns: ['str', 'y'], + itemMerger: { + groupKey: 'line', + merge: mergeLineItems, + }, }, (incomingSchema) => { return incomingSchema.reduce((schema, column) => { @@ -41,3 +45,25 @@ export default class CompactLines extends ItemTransformer { }; } } + +function mergeLineItems(items: Item[]): Item { + const page = items[0].page; + const line = items[0].data['line']; + const str = items.map((item) => item.data['str']).join(' '); + const x = Math.min(...items.map((item) => item.data['x'])); + const y = Math.min(...items.map((item) => item.data['y'])); + const width = items.reduce((sum, item) => sum + item.data['width'], 0); + const height = Math.max(...items.map((item) => item.data['height'])); + const fontNames = [...new Set(items.map((item) => item.data['fontName']))]; + const directions = [...new Set(items.map((item) => item.data['dir']))]; + return new Item(page, { + str, + line, + x, + y, + width, + height, + fontName: fontNames, + dir: directions, + }); +} diff --git a/core/src/transformer/ItemTransformer.ts b/core/src/transformer/ItemTransformer.ts index 9b94770..7cd73ac 100644 --- a/core/src/transformer/ItemTransformer.ts +++ b/core/src/transformer/ItemTransformer.ts @@ -1,4 +1,4 @@ -import type TransformerDescriptor from '../TransformerDescription'; +import TransformDescriptor, { toDescriptor } from '../TransformDescriptor'; import type TransformContext from './TransformContext'; import type Item from '../Item'; import type ItemResult from '../ItemResult'; @@ -11,25 +11,18 @@ type SchemaTransformer = (incomingSchema: string[]) => string[]; export default abstract class ItemTransformer { readonly name: string; readonly description: string; - readonly descriptor: TransformerDescriptor; + readonly descriptor: TransformDescriptor; readonly schemaTransformer: SchemaTransformer; constructor( name: string, description: string, - descriptor: TransformerDescriptor, + descriptorPartial: Partial, schemaTransformer: SchemaTransformer = (schema) => schema, ) { this.name = name; this.description = description; - this.descriptor = { - ...{ - consumesGlobels: [], - producesGlobels: [], - requireColumns: [], - }, - ...descriptor, - }; + this.descriptor = toDescriptor(descriptorPartial); this.schemaTransformer = schemaTransformer; } diff --git a/core/src/transformer/transformerUtils.ts b/core/src/transformer/transformerUtils.ts deleted file mode 100644 index e1544d9..0000000 --- a/core/src/transformer/transformerUtils.ts +++ /dev/null @@ -1,19 +0,0 @@ -import Item from '../Item'; - -type PageItemTransformer = (page: number, items: Item[]) => Item[]; - -export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer) { - return new Array().concat( - ...items - .reduce((pageItems: Item[][], item: Item) => { - const lastPageItems = pageItems[pageItems.length - 1]; - if (!lastPageItems || item.page > lastPageItems[0]?.page) { - pageItems.push([item]); - } else { - lastPageItems.push(item); - } - return pageItems; - }, []) - .map((pageItems) => groupedTransformer(pageItems[0].page, pageItems)), - ); -} diff --git a/core/test/Debugger.test.ts b/core/test/Debugger.test.ts index 1d9f611..850dc57 100644 --- a/core/test/Debugger.test.ts +++ b/core/test/Debugger.test.ts @@ -1,7 +1,7 @@ import Debugger from 'src/Debugger'; import Item from 'src/Item'; import ItemTransformer from 'src/transformer/ItemTransformer'; -import TransformerDescriptor from 'src/TransformerDescription'; +import TransformDescriptor from 'src/TransformDescriptor'; import TransformContext from 'src/transformer/TransformContext'; import ItemResult from 'src/ItemResult'; import ColumnAnnotation from 'src/debug/ColumnAnnotation'; @@ -9,7 +9,7 @@ import AnnotatedColumn from 'src/debug/AnnotatedColumn'; class TestTransformer extends ItemTransformer { items: Item[]; - constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[], items: Item[]) { + constructor(name: string, descriptor: Partial, outputSchema: string[], items: Item[]) { super(name, `Description for ${name}`, descriptor, (incomingSchema) => outputSchema); this.items = items; } @@ -30,7 +30,7 @@ test('basic debug', async () => { const trans1Items = parsedItems.map((item) => item.withData({ C: `c=${item.value('A')}+${item.value('B')}` })); const transformers = [new TestTransformer('Trans1', trans1Desc, trans1Schema, trans1Items)]; - const debug = new Debugger(parsedSchema, parsedItems, { pageViewports: [] }, transformers); + const debug = new Debugger(parsedSchema, parsedItems, { fontMap: new Map(), pageViewports: [] }, transformers); expect(debug.stageNames).toEqual(['Parse Result', 'Trans1']); expect(debug.stageResults(0).schema).toEqual(parsedSchema.map((column) => ({ name: column }))); @@ -47,7 +47,7 @@ describe('build schemas', () => { function calculateSchema(inputSchema: string[], outputSchema: string[]): AnnotatedColumn[] { const transformers = [new TestTransformer('Trans1', {}, outputSchema, items)]; - const debug = new Debugger(inputSchema, items, { pageViewports: [] }, transformers); + const debug = new Debugger(inputSchema, items, { fontMap: new Map(), pageViewports: [] }, transformers); return debug.stageResults(1).schema; } diff --git a/core/test/PdfPipeline.test.ts b/core/test/PdfPipeline.test.ts index d25ee77..64d27a5 100644 --- a/core/test/PdfPipeline.test.ts +++ b/core/test/PdfPipeline.test.ts @@ -1,4 +1,4 @@ -import TransformerDescriptor from 'src/TransformerDescription'; +import TransformDescriptor from 'src/TransformDescriptor'; import Item from 'src/Item'; import ItemResult from 'src/ItemResult'; import ItemTransformer from 'src/transformer/ItemTransformer'; @@ -9,7 +9,7 @@ import * as fs from 'fs'; import PdfPipeline from 'src/PdfPipeline'; class TestSchemaTransformer extends ItemTransformer { - constructor(name: string, descriptor: TransformerDescriptor, outputSchema: string[] | undefined = undefined) { + constructor(name: string, descriptor: Partial, outputSchema: string[] | undefined = undefined) { if (outputSchema) { super(name, `Description for ${name}`, descriptor, (_) => outputSchema); } else { diff --git a/core/test/support/itemUtils.test.ts b/core/test/support/itemUtils.test.ts new file mode 100644 index 0000000..1cd2440 --- /dev/null +++ b/core/test/support/itemUtils.test.ts @@ -0,0 +1,112 @@ +import Item from 'src/Item'; +import Page from 'src/support/Page'; +import { groupByPage, groupByElement, transformGroupedByPage, asPages } from 'src/support/itemUtils'; +import ItemGroup from 'src/support/ItemGroup'; +import ItemMerger from 'src/ItemMerger'; +import ItemTransformer from 'src/transformer/ItemTransformer'; + +describe('groupByPage', () => { + test('empty', async () => { + expect(groupByPage([])).toEqual([]); + }); + + test('group', async () => { + const pageItems = [ + [new Item(0, { id: 1 })], + [new Item(1, { id: 2 }), new Item(1, { id: 3 })], + [new Item(2, { id: 4 })], + ]; + const flattenedItems = new Array().concat(...pageItems); + const transformedItems = groupByPage(flattenedItems); + expect(transformedItems).toEqual(pageItems); + }); +}); + +describe('groupByElement', () => { + test('empty', async () => { + expect(groupByElement([], 'line')).toEqual([]); + }); + + test('group', async () => { + const groupedItems = [ + [new Item(0, { line: 1, id: 1 })], + [new Item(0, { line: 2, id: 2 }), new Item(0, { line: 2, id: 3 })], + [new Item(0, { line: 3, id: 4 })], + ]; + const flattenedItems = new Array().concat(...groupedItems); + const transformedItems = groupByElement(flattenedItems, 'line'); + expect(transformedItems).toEqual(groupedItems); + }); +}); + +describe('transformGroupedByPage', () => { + test('empty', async () => { + const transformedItems = transformGroupedByPage([], () => fail("shoudln't be called")); + expect(transformedItems).toEqual([]); + }); + + test('pipe through', async () => { + const pageItems = [ + [new Item(0, { id: 1 })], + [new Item(1, { id: 2 }), new Item(1, { id: 3 })], + [new Item(2, { id: 4 })], + ]; + const flattenedItems = new Array().concat(...pageItems); + const transformedItems = transformGroupedByPage(flattenedItems, (page, items) => { + expect(items).toEqual(pageItems[page]); + return items; + }); + expect(transformedItems).toEqual(flattenedItems); + }); + + test('change', async () => { + const input = [new Item(0, { v: 0 }), new Item(1, { v: 0 })]; + const transformedItems = transformGroupedByPage(input, (_, items) => { + return [items[0].withData({ v: 1 })]; + }); + expect(transformedItems).toEqual(input.map((item) => item.withData({ v: 1 }))); + }); +}); + +describe('asPages', () => { + test('empty', async () => { + expect(groupByPage([])).toEqual([]); + }); + + test('no merger', async () => { + const pageItems = [ + [new Item(0, { id: 1, line: 1 })], + [new Item(1, { id: 2, line: 1 }), new Item(1, { id: 3, line: 1 }), new Item(1, { id: 4, line: 2 })], + [new Item(2, { id: 5, line: 1 })], + ]; + const flattenedItems = new Array().concat(...pageItems); + const pages = asPages(flattenedItems); + expect(pages).toEqual([ + { index: 0, itemGroups: pageItems[0].map((item) => new ItemGroup(item)) }, + { index: 1, itemGroups: pageItems[1].map((item) => new ItemGroup(item)) }, + { index: 2, itemGroups: pageItems[2].map((item) => new ItemGroup(item)) }, + ] as Page[]); + }); + + test('merger', async () => { + const pageItems = [ + [new Item(0, { id: 1, line: 1 })], + [new Item(1, { id: 2, line: 1 }), new Item(1, { id: 3, line: 1 }), new Item(1, { id: 4, line: 2 })], + [new Item(2, { id: 5, line: 1 })], + ]; + const flattenedItems = new Array().concat(...pageItems); + const merger: ItemMerger = { groupKey: 'line', merge: (items) => items[0] }; + const pages = asPages(flattenedItems, merger); + expect(pages).toEqual([ + { index: 0, itemGroups: pageItems[0].map((item) => new ItemGroup(item)) }, + { + index: 1, + itemGroups: [ + new ItemGroup(merger.merge(pageItems[1].slice(0, 2)), pageItems[1].slice(0, 2)), + new ItemGroup(pageItems[1][2]), + ], + }, + { index: 2, itemGroups: pageItems[2].map((item) => new ItemGroup(item)) }, + ] as Page[]); + }); +}); diff --git a/core/test/transformer/CompactLines.test.ts b/core/test/transformer/CompactLines.test.ts new file mode 100644 index 0000000..8e78c90 --- /dev/null +++ b/core/test/transformer/CompactLines.test.ts @@ -0,0 +1,51 @@ +import Item from 'src/Item'; +import CompactLines from 'src/transformer/CompactLines'; +test('Item Merger', async () => { + const itemMerger = new CompactLines().descriptor.itemMerger; + expect(itemMerger?.groupKey).toEqual('line'); + + const mergedItem = itemMerger?.merge([ + new Item(0, { + line: 2, + x: 240, + y: 585, + str: 'Dies ist eine Test-PDF', + fontName: 'g_d0_f2', + dir: 'ltr', + width: 108.62, + height: 11, + }), + new Item(0, { + line: 2, + x: 352.69, + y: 585, + str: '.', + fontName: 'g_d0_f2', + dir: 'ltr', + width: 3.06, + height: 11, + }), + new Item(0, { + line: 2, + x: 348, + y: 588, + str: '1', + fontName: 'g_d0_f2', + dir: 'ltr', + width: 4.08, + height: 7.33, + }), + ]); + expect(mergedItem?.withoutUuid()).toEqual( + new Item(0, { + line: 2, + x: 240, + y: 585, + str: 'Dies ist eine Test-PDF . 1', + fontName: ['g_d0_f2'], + dir: ['ltr'], + width: 115.76, + height: 11, + }).withoutUuid(), + ); +}); diff --git a/core/test/transformer/transformerUtils.test.ts b/core/test/transformer/transformerUtils.test.ts deleted file mode 100644 index a430ebf..0000000 --- a/core/test/transformer/transformerUtils.test.ts +++ /dev/null @@ -1,31 +0,0 @@ -import Item from 'src/Item'; -import { transformGroupedByPage } from 'src/transformer/transformerUtils'; - -describe('transformGroupedByPage', () => { - test('empty', async () => { - const transformedItems = transformGroupedByPage([], () => fail("shoudln't be called")); - expect(transformedItems).toEqual([]); - }); - - test('pipe through', async () => { - const pageItems = [ - [new Item(0, { id: 1 })], - [new Item(1, { id: 2 }), new Item(1, { id: 3 })], - [new Item(2, { id: 4 })], - ]; - const flattenedItems = new Array().concat(...pageItems); - const transformedItems = transformGroupedByPage(flattenedItems, (page, items) => { - expect(items).toEqual(pageItems[page]); - return items; - }); - expect(transformedItems).toEqual(flattenedItems); - }); - - test('change', async () => { - const input = [new Item(0, { v: 0 }), new Item(1, { v: 0 })]; - const transformedItems = transformGroupedByPage(input, (_, items) => { - return [items[0].withData({ v: 1 })]; - }); - expect(transformedItems).toEqual(input.map((item) => item.withData({ v: 1 }))); - }); -}); diff --git a/ui/src/debug/DebugView.svelte b/ui/src/debug/DebugView.svelte index 4ac0471..f3df96a 100644 --- a/ui/src/debug/DebugView.svelte +++ b/ui/src/debug/DebugView.svelte @@ -3,8 +3,11 @@ import Icon from 'fa-svelte'; import { faMapPin as pin } from '@fortawesome/free-solid-svg-icons/faMapPin'; import { BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons'; + import type Debugger from '@core/Debugger'; import type Item from '@core/Item'; + import { asPages } from '../../../core/src/support/itemUtils'; + import Popup from '../components/Popup.svelte'; import PageSelectionPopup from './PageSelectionPopup.svelte'; import ItemTable from './ItemTable.svelte'; @@ -21,17 +24,9 @@ $: stageResult = debug.stageResults(currentStage); $: pageIsPinned = !isNaN(pinnedPage); $: pagesNumbers = new Set(stageResult.items.map((item) => item.page)); + $: pages = asPages(stageResult.items, stageResult.descriptor?.itemMerger); $: maxPage = Math.max(...pagesNumbers); - $: itemsByPage = [ - ...stageResult.items.reduce((map, item) => { - if (!map.has(item.page)) { - map.set(item.page, []); - } - map.get(item.page).push(item); - return map; - }, new Map()), - ]; - $: visiblePages = pageIsPinned ? itemsByPage.filter(([page]) => page === pinnedPage) : itemsByPage; + $: visiblePages = pageIsPinned ? pages.filter((page) => page.index === pinnedPage) : pages;
@@ -97,7 +92,7 @@ - +