Cleanup & simple line detection

This commit is contained in:
Johannes Zillmann 2021-02-21 08:23:51 +01:00
parent 71fb6a23ff
commit d8bc6d100b
10 changed files with 139 additions and 61 deletions

View File

@ -9,7 +9,7 @@ import ParseResult from './ParseResult';
export default class PdfParser {
pdfjs: any;
defaultParams: object;
schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height'];
schema = ['transform', 'str', 'fontName', 'dir', 'width', 'height'];
constructor(pdfjs: any, defaultParams = {}) {
this.pdfjs = pdfjs;

View File

@ -3,6 +3,7 @@ import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import { transformGroupedByPage } from './transformerUtils';
export default class AdjustHeight extends ItemTransformer {
constructor() {
@ -11,30 +12,27 @@ export default class AdjustHeight extends ItemTransformer {
});
}
transform(context: TransformContext, items: Item[]): ItemResult {
const newItems: Item[] = [];
let page = -1;
let pageViewport: PageViewport;
//TODO groupBy page
transform(context: TransformContext, inputItems: Item[]): ItemResult {
let correctedHeights = 0;
items.forEach((item) => {
if (item.page !== page) {
pageViewport = context.pageViewports[item.page];
page = page;
}
const itemTransform = item.data['transform'];
const itemHeight = item.data['height'];
const tx = pageViewport.transformFunction(itemTransform);
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
const dividedHeight = itemHeight / fontHeight;
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
if (newHeight !== itemHeight) {
correctedHeights++;
newItems.push(item.withDataAddition({ height: newHeight }));
} else {
newItems.push(item);
}
});
return { items, messages: [`${correctedHeights} corrected heights`] };
return {
items: transformGroupedByPage(inputItems, (page, items) => {
const pageViewport = context.pageViewports[page];
return items.map((item) => {
const itemTransform = item.data['transform'];
const itemHeight = item.data['height'];
const tx = pageViewport.transformFunction(itemTransform);
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
const dividedHeight = itemHeight / fontHeight;
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
if (newHeight === itemHeight) {
return item;
} else {
correctedHeights++;
return item.withDataAddition({ height: newHeight });
}
});
}),
messages: [`${correctedHeights} corrected heights`],
};
}
}

View File

@ -26,9 +26,9 @@ export default class CalculateStatistics extends ItemTransformer {
let maxHeight = 0;
let maxHeightFont;
items.forEach((item) => {
const itemHeight = item.data['height'];
const itemFont = item.data['fontName'];
items.forEach((inputItems) => {
const itemHeight = inputItems.data['height'];
const itemFont = inputItems.data['fontName'];
heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1;
fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? fontToOccurrence[itemFont] + 1 : 1;
if (itemHeight > maxHeight) {

View File

@ -22,13 +22,15 @@ export default class CalculateCoordinates extends ItemTransformer {
);
}
transform(_: TransformContext, items: Item[]): ItemResult {
const transformedItems = items.map((item) => {
const transform: number[] = item.data['transform'];
const x = transform[4];
const y = transform[5];
return item.withDataAddition({ x, y });
});
return { items: transformedItems, messages: [] };
/**
 * Adds `x` and `y` columns to every item.
 *
 * The values are read from indices 4 and 5 of the item's `transform` array
 * (presumably the translation part of the PDF text matrix — confirm against the parser).
 * The transform context is not used and no messages are produced.
 */
transform(_: TransformContext, inputItems: Item[]): ItemResult {
  const withCoordinates = inputItems.map((item) => {
    const matrix: number[] = item.data['transform'];
    return item.withDataAddition({ x: matrix[4], y: matrix[5] });
  });
  return { items: withCoordinates, messages: [] };
}
}

View File

@ -2,15 +2,42 @@ import Item from '../Item';
import ItemResult from '../ItemResult';
import ItemTransformer from './ItemTransformer';
import TransformContext from './TransformContext';
import { transformGroupedByPage } from './transformerUtils';
export default class CompactLines extends ItemTransformer {
constructor() {
super('Compact Lines', 'Combines items on the same y-axis', {
requireColumns: ['str', 'y'],
});
super(
'Compact Lines',
'Combines items on the same y-axis',
{
requireColumns: ['str', 'y'],
},
(incomingSchema) => {
return incomingSchema.reduce((schema, column) => {
if (column === 'x') {
return [...schema, 'line', 'x'];
}
return [...schema, column];
}, new Array<string>());
},
);
}
transform(_: TransformContext, items: Item[]): ItemResult {
return { items: items, messages: [] };
/**
 * Annotates every item with a `line` number, counted from 0 and restarted for each page.
 *
 * Items are visited in their incoming order; a new line begins with the first item of a
 * page and whenever an item's `y` is smaller than the `y` of the item before it.
 * The transform context is not used and no messages are produced.
 */
transform(_: TransformContext, inputItems: Item[]): ItemResult {
  return {
    items: transformGroupedByPage(inputItems, (_page, items) => {
      let lineNumber = -1;
      let lastY: number | undefined;
      return items.map((item) => {
        const y = item.data['y'];
        // Compare against undefined explicitly: a truthiness check would treat a previous
        // y of 0 as "no previous item" and wrongly start a new line after it.
        if (lastY === undefined || y < lastY) {
          lineNumber++;
        }
        lastY = y;
        return item.withDataAddition({ line: lineNumber });
      });
    }),
    messages: [],
  };
}
}

View File

@ -33,5 +33,5 @@ export default abstract class ItemTransformer {
this.schemaTransformer = schemaTransformer;
}
abstract transform(context: TransformContext, items: Item[]): ItemResult;
abstract transform(context: TransformContext, inputItems: Item[]): ItemResult;
}

View File

@ -0,0 +1,19 @@
import Item from '../Item';
/** Callback that receives a page number and the consecutive items of that page and returns their replacements. */
type PageItemTransformer = (page: number, items: Item[]) => Item[];

/**
 * Splits `items` into runs of consecutive items sharing the same page, hands each run to
 * `groupedTransformer`, and returns the concatenation of all results in the original order.
 *
 * A new run starts whenever the page differs from the previous item's page (not only when it
 * increases), so the transformer is never handed items that belong to a different page than
 * the page number it is called with.
 *
 * @param items the items to process; for an empty array the transformer is never called
 * @param groupedTransformer invoked once per run with the run's page and its items
 * @returns all items returned by the transformer calls, flattened into one array
 */
export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] {
  const pageGroups: Item[][] = [];
  for (const item of items) {
    const currentGroup = pageGroups[pageGroups.length - 1];
    if (currentGroup !== undefined && currentGroup[0].page === item.page) {
      currentGroup.push(item);
    } else {
      pageGroups.push([item]);
    }
  }

  const transformedItems: Item[] = [];
  for (const group of pageGroups) {
    // Append element by element instead of spreading, so very large groups cannot
    // exceed the engine's argument-count limit.
    for (const transformed of groupedTransformer(group[0].page, group)) {
      transformedItems.push(transformed);
    }
  }
  return transformedItems;
}

View File

@ -0,0 +1,31 @@
import Item from 'src/Item';
import { transformGroupedByPage } from 'src/transformer/transformerUtils';
// Specs for transformGroupedByPage: empty input, pass-through, and item replacement.
describe('transformGroupedByPage', () => {
  // With no items the page callback must never fire and the result is an empty array.
  test('empty', async () => {
    const result = transformGroupedByPage([], () => fail("shoudln't be called"));
    expect(result).toEqual([]);
  });

  // Each callback invocation must receive exactly the items of its page, and returning
  // them untouched must reproduce the flattened input.
  test('pipe through', async () => {
    const itemsPerPage = [
      [new Item(0, { id: 1 })],
      [new Item(1, { id: 2 }), new Item(1, { id: 3 })],
      [new Item(2, { id: 4 })],
    ];
    const allItems = new Array<Item>().concat(...itemsPerPage);

    const result = transformGroupedByPage(allItems, (pageIndex, itemsOfPage) => {
      expect(itemsOfPage).toEqual(itemsPerPage[pageIndex]);
      return itemsOfPage;
    });

    expect(result).toEqual(allItems);
  });

  // Whatever the callback returns for a page replaces that page's items in the output.
  test('change', async () => {
    const original = [new Item(0, { v: 0 }), new Item(1, { v: 0 })];
    const expected = original.map((item) => item.withData({ v: 1 }));

    const result = transformGroupedByPage(original, (_, itemsOfPage) => {
      return [itemsOfPage[0].withData({ v: 1 })];
    });

    expect(result).toEqual(expected);
  });
});

View File

@ -1,9 +1,10 @@
<script>
import { scale, fade } from 'svelte/transition';
import inView from '../actions/inView';
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
import type Item from '@core/Item';
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
import inView from '../actions/inView';
import { formatValue } from './formatValues';
export let schema: AnnotatedColumn[];
export let itemsByPage: [number, Item[]][];
@ -23,22 +24,6 @@
}
}
function format(value: object) {
if (typeof value === 'number') {
return (value as number).toFixed(2);
}
if (typeof value === 'object' && typeof Array.isArray(value)) {
let array = value as Array<object>;
if (array.length > 0 && typeof array[0] === 'number') {
array = (array.map((element) =>
((element as unknown) as number).toFixed(2)
) as unknown) as Array<object>;
}
return '[' + array.join(', ') + ']';
}
return value;
}
function calculateNextPageToRenderTo() {
if (renderedMaxPage >= maxPage) {
return;
@ -88,7 +73,7 @@
{/if}
<td>{itemIdx}</td>
{#each schema as column}
<td class="select-all">{format(item.data[column.name])}</td>
<td class="select-all">{formatValue(item.data[column.name])}</td>
{/each}
</tr>
{/each}

View File

@ -0,0 +1,16 @@
/**
 * Formats a raw value for display.
 *
 * - Integers are returned unchanged.
 * - Any other number is rendered as a string with two decimals.
 * - Arrays are rendered as `[a, b, c]`; numeric elements (integers included) get two decimals,
 *   other elements are joined as-is.
 * - Everything else (strings, plain objects, `null`, `undefined`, ...) is returned unchanged.
 *
 * @param value the value to format
 * @returns the formatted string, or the value itself when no formatting applies
 */
export function formatValue(value: unknown): unknown {
  if (Number.isInteger(value)) {
    return value;
  }
  if (typeof value === 'number') {
    return value.toFixed(2);
  }
  // A real array check: `typeof Array.isArray(value)` is always the truthy string 'function',
  // which sent null and plain objects down the array path where they threw.
  if (Array.isArray(value)) {
    // Decide per element so a non-number in a numeric array cannot break on toFixed.
    const parts = value.map((element: unknown) => (typeof element === 'number' ? element.toFixed(2) : element));
    return '[' + parts.join(', ') + ']';
  }
  return value;
}