From d8bc6d100becb29658d71d5b956cef044823d4c4 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sun, 21 Feb 2021 08:23:51 +0100 Subject: [PATCH] Cleanup & simple line detection --- core/src/PdfParser.ts | 2 +- core/src/transformer/AdjustHeight.ts | 46 +++++++++---------- core/src/transformer/CacluclateStatistics.ts | 6 +-- core/src/transformer/CalculateCoordinates.ts | 18 ++++---- core/src/transformer/CompactLines.ts | 37 +++++++++++++-- core/src/transformer/ItemTransformer.ts | 2 +- core/src/transformer/transformerUtils.ts | 19 ++++++++ .../test/transformer/transformerUtils.test.ts | 31 +++++++++++++ ui/src/debug/ItemTable.svelte | 23 ++-------- ui/src/debug/formatValues.ts | 16 +++++++ 10 files changed, 139 insertions(+), 61 deletions(-) create mode 100644 core/src/transformer/transformerUtils.ts create mode 100644 core/test/transformer/transformerUtils.test.ts create mode 100644 ui/src/debug/formatValues.ts diff --git a/core/src/PdfParser.ts b/core/src/PdfParser.ts index bc714a0..96f7610 100644 --- a/core/src/PdfParser.ts +++ b/core/src/PdfParser.ts @@ -9,7 +9,7 @@ import ParseResult from './ParseResult'; export default class PdfParser { pdfjs: any; defaultParams: object; - schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height']; + schema = ['transform', 'str', 'fontName', 'dir', 'width', 'height']; constructor(pdfjs: any, defaultParams = {}) { this.pdfjs = pdfjs; diff --git a/core/src/transformer/AdjustHeight.ts b/core/src/transformer/AdjustHeight.ts index cdfabb4..1ac53c7 100644 --- a/core/src/transformer/AdjustHeight.ts +++ b/core/src/transformer/AdjustHeight.ts @@ -3,6 +3,7 @@ import Item from '../Item'; import ItemResult from '../ItemResult'; import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; +import { transformGroupedByPage } from './transformerUtils'; export default class AdjustHeight extends ItemTransformer { constructor() { @@ -11,30 +12,27 @@ export default class AdjustHeight extends 
ItemTransformer { }); } - transform(context: TransformContext, items: Item[]): ItemResult { - const newItems: Item[] = []; - let page = -1; - let pageViewport: PageViewport; - //TODO groupBy page + transform(context: TransformContext, inputItems: Item[]): ItemResult { let correctedHeights = 0; - items.forEach((item) => { - if (item.page !== page) { - pageViewport = context.pageViewports[item.page]; - page = page; - } - const itemTransform = item.data['transform']; - const itemHeight = item.data['height']; - const tx = pageViewport.transformFunction(itemTransform); - const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); - const dividedHeight = itemHeight / fontHeight; - const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight; - if (newHeight !== itemHeight) { - correctedHeights++; - newItems.push(item.withDataAddition({ height: newHeight })); - } else { - newItems.push(item); - } - }); - return { items, messages: [`${correctedHeights} corrected heights`] }; + return { + items: transformGroupedByPage(inputItems, (page, items) => { + const pageViewport = context.pageViewports[page]; + return items.map((item) => { + const itemTransform = item.data['transform']; + const itemHeight = item.data['height']; + const tx = pageViewport.transformFunction(itemTransform); + const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]); + const dividedHeight = itemHeight / fontHeight; + const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? 
itemHeight : dividedHeight; + if (newHeight === itemHeight) { + return item; + } else { + correctedHeights++; + return item.withDataAddition({ height: newHeight }); + } + }); + }), + messages: [`${correctedHeights} corrected heights`], + }; } } diff --git a/core/src/transformer/CacluclateStatistics.ts b/core/src/transformer/CacluclateStatistics.ts index dfbaae6..f93ea37 100644 --- a/core/src/transformer/CacluclateStatistics.ts +++ b/core/src/transformer/CacluclateStatistics.ts @@ -26,9 +26,9 @@ export default class CalculateStatistics extends ItemTransformer { let maxHeight = 0; let maxHeightFont; - items.forEach((item) => { - const itemHeight = item.data['height']; - const itemFont = item.data['fontName']; + items.forEach((inputItems) => { + const itemHeight = inputItems.data['height']; + const itemFont = inputItems.data['fontName']; heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1; fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? 
fontToOccurrence[itemFont] + 1 : 1; if (itemHeight > maxHeight) { diff --git a/core/src/transformer/CalculateCoordinates.ts b/core/src/transformer/CalculateCoordinates.ts index 7423366..e063b4f 100644 --- a/core/src/transformer/CalculateCoordinates.ts +++ b/core/src/transformer/CalculateCoordinates.ts @@ -22,13 +22,15 @@ export default class CalculateCoordinates extends ItemTransformer { ); } - transform(_: TransformContext, items: Item[]): ItemResult { - const transformedItems = items.map((item) => { - const transform: number[] = item.data['transform']; - const x = transform[4]; - const y = transform[5]; - return item.withDataAddition({ x, y }); - }); - return { items: transformedItems, messages: [] }; + transform(_: TransformContext, inputItems: Item[]): ItemResult { + return { + items: inputItems.map((item) => { + const transform: number[] = item.data['transform']; + const x = transform[4]; + const y = transform[5]; + return item.withDataAddition({ x, y }); + }), + messages: [], + }; } } diff --git a/core/src/transformer/CompactLines.ts b/core/src/transformer/CompactLines.ts index 3e6ed12..e6b9a65 100644 --- a/core/src/transformer/CompactLines.ts +++ b/core/src/transformer/CompactLines.ts @@ -2,15 +2,42 @@ import Item from '../Item'; import ItemResult from '../ItemResult'; import ItemTransformer from './ItemTransformer'; import TransformContext from './TransformContext'; +import { transformGroupedByPage } from './transformerUtils'; export default class CompactLines extends ItemTransformer { constructor() { - super('Compact Lines', 'Combines items on the same y-axis', { - requireColumns: ['str', 'y'], - }); + super( + 'Compact Lines', + 'Combines items on the same y-axis', + { + requireColumns: ['str', 'y'], + }, + (incomingSchema) => { + return incomingSchema.reduce((schema, column) => { + if (column === 'x') { + return [...schema, 'line', 'x']; + } + return [...schema, column]; + }, new Array()); + }, + ); } - transform(_: TransformContext, items: Item[]): 
ItemResult { - return { items: items, messages: [] }; + transform(_: TransformContext, inputItems: Item[]): ItemResult { + return { + items: transformGroupedByPage(inputItems, (page, items) => { + let lineNumber = -1; + let lastY: number | undefined; /* y of the previous item on this page; undefined before the first item */ + return items.map((item) => { + const y = item.data['y']; + if (lastY === undefined || y < lastY) { /* explicit undefined check so a previous y of 0 is not mistaken for "no previous item" */ + lineNumber++; + } + lastY = y; + return item.withDataAddition({ line: lineNumber }); + }); + }), + messages: [], + }; } } diff --git a/core/src/transformer/ItemTransformer.ts b/core/src/transformer/ItemTransformer.ts index e5b1c31..9b94770 100644 --- a/core/src/transformer/ItemTransformer.ts +++ b/core/src/transformer/ItemTransformer.ts @@ -33,5 +33,5 @@ export default abstract class ItemTransformer { this.schemaTransformer = schemaTransformer; } - abstract transform(context: TransformContext, items: Item[]): ItemResult; + abstract transform(context: TransformContext, inputItems: Item[]): ItemResult; } diff --git a/core/src/transformer/transformerUtils.ts b/core/src/transformer/transformerUtils.ts new file mode 100644 index 0000000..e1544d9 --- /dev/null +++ b/core/src/transformer/transformerUtils.ts @@ -0,0 +1,19 @@ +import Item from '../Item'; + +type PageItemTransformer = (page: number, items: Item[]) => Item[]; + +export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer) { /* Splits items into runs (a new run starts when an item's page is greater than the current run's page), calls groupedTransformer once per run with that run's page, and concatenates the returned arrays. */ + return new Array().concat( + ...items + .reduce((pageItems: Item[][], item: Item) => { + const lastPageItems = pageItems[pageItems.length - 1]; + if (!lastPageItems || item.page > lastPageItems[0]?.page) { + pageItems.push([item]); + } else { + lastPageItems.push(item); + } + return pageItems; + }, []) + .map((pageItems) => groupedTransformer(pageItems[0].page, pageItems)), + ); +} diff --git a/core/test/transformer/transformerUtils.test.ts b/core/test/transformer/transformerUtils.test.ts new file mode 100644 index 0000000..a430ebf --- /dev/null +++ b/core/test/transformer/transformerUtils.test.ts @@ -0,0 +1,31 @@
+import Item from 'src/Item'; +import { transformGroupedByPage } from 'src/transformer/transformerUtils'; + +describe('transformGroupedByPage', () => { + test('empty', async () => { + const transformedItems = transformGroupedByPage([], () => fail("shoudln't be called")); + expect(transformedItems).toEqual([]); + }); + + test('pipe through', async () => { + const pageItems = [ + [new Item(0, { id: 1 })], + [new Item(1, { id: 2 }), new Item(1, { id: 3 })], + [new Item(2, { id: 4 })], + ]; + const flattenedItems = new Array().concat(...pageItems); + const transformedItems = transformGroupedByPage(flattenedItems, (page, items) => { + expect(items).toEqual(pageItems[page]); + return items; + }); + expect(transformedItems).toEqual(flattenedItems); + }); + + test('change', async () => { + const input = [new Item(0, { v: 0 }), new Item(1, { v: 0 })]; + const transformedItems = transformGroupedByPage(input, (_, items) => { + return [items[0].withData({ v: 1 })]; + }); + expect(transformedItems).toEqual(input.map((item) => item.withData({ v: 1 }))); + }); +}); diff --git a/ui/src/debug/ItemTable.svelte b/ui/src/debug/ItemTable.svelte index d5bca61..8e33fdc 100644 --- a/ui/src/debug/ItemTable.svelte +++ b/ui/src/debug/ItemTable.svelte @@ -1,9 +1,10 @@