mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-03 20:28:54 +01:00
Cleanup & simple line detection
This commit is contained in:
parent
71fb6a23ff
commit
d8bc6d100b
@ -9,7 +9,7 @@ import ParseResult from './ParseResult';
|
||||
export default class PdfParser {
|
||||
pdfjs: any;
|
||||
defaultParams: object;
|
||||
schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height'];
|
||||
schema = ['transform', 'str', 'fontName', 'dir', 'width', 'height'];
|
||||
|
||||
constructor(pdfjs: any, defaultParams = {}) {
|
||||
this.pdfjs = pdfjs;
|
||||
|
@ -3,6 +3,7 @@ import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import { transformGroupedByPage } from './transformerUtils';
|
||||
|
||||
export default class AdjustHeight extends ItemTransformer {
|
||||
constructor() {
|
||||
@ -11,30 +12,27 @@ export default class AdjustHeight extends ItemTransformer {
|
||||
});
|
||||
}
|
||||
|
||||
transform(context: TransformContext, items: Item[]): ItemResult {
|
||||
const newItems: Item[] = [];
|
||||
let page = -1;
|
||||
let pageViewport: PageViewport;
|
||||
//TODO groupBy page
|
||||
transform(context: TransformContext, inputItems: Item[]): ItemResult {
|
||||
let correctedHeights = 0;
|
||||
items.forEach((item) => {
|
||||
if (item.page !== page) {
|
||||
pageViewport = context.pageViewports[item.page];
|
||||
page = page;
|
||||
}
|
||||
return {
|
||||
items: transformGroupedByPage(inputItems, (page, items) => {
|
||||
const pageViewport = context.pageViewports[page];
|
||||
return items.map((item) => {
|
||||
const itemTransform = item.data['transform'];
|
||||
const itemHeight = item.data['height'];
|
||||
const tx = pageViewport.transformFunction(itemTransform);
|
||||
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
||||
const dividedHeight = itemHeight / fontHeight;
|
||||
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
|
||||
if (newHeight !== itemHeight) {
|
||||
correctedHeights++;
|
||||
newItems.push(item.withDataAddition({ height: newHeight }));
|
||||
if (newHeight === itemHeight) {
|
||||
return item;
|
||||
} else {
|
||||
newItems.push(item);
|
||||
correctedHeights++;
|
||||
return item.withDataAddition({ height: newHeight });
|
||||
}
|
||||
});
|
||||
return { items, messages: [`${correctedHeights} corrected heights`] };
|
||||
}),
|
||||
messages: [`${correctedHeights} corrected heights`],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -26,9 +26,9 @@ export default class CalculateStatistics extends ItemTransformer {
|
||||
let maxHeight = 0;
|
||||
let maxHeightFont;
|
||||
|
||||
items.forEach((item) => {
|
||||
const itemHeight = item.data['height'];
|
||||
const itemFont = item.data['fontName'];
|
||||
items.forEach((inputItems) => {
|
||||
const itemHeight = inputItems.data['height'];
|
||||
const itemFont = inputItems.data['fontName'];
|
||||
heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1;
|
||||
fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? fontToOccurrence[itemFont] + 1 : 1;
|
||||
if (itemHeight > maxHeight) {
|
||||
|
@ -22,13 +22,15 @@ export default class CalculateCoordinates extends ItemTransformer {
|
||||
);
|
||||
}
|
||||
|
||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
||||
const transformedItems = items.map((item) => {
|
||||
transform(_: TransformContext, inputItems: Item[]): ItemResult {
|
||||
return {
|
||||
items: inputItems.map((item) => {
|
||||
const transform: number[] = item.data['transform'];
|
||||
const x = transform[4];
|
||||
const y = transform[5];
|
||||
return item.withDataAddition({ x, y });
|
||||
});
|
||||
return { items: transformedItems, messages: [] };
|
||||
}),
|
||||
messages: [],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -2,15 +2,42 @@ import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import { transformGroupedByPage } from './transformerUtils';
|
||||
|
||||
export default class CompactLines extends ItemTransformer {
|
||||
constructor() {
|
||||
super('Compact Lines', 'Combines items on the same y-axis', {
|
||||
super(
|
||||
'Compact Lines',
|
||||
'Combines items on the same y-axis',
|
||||
{
|
||||
requireColumns: ['str', 'y'],
|
||||
});
|
||||
},
|
||||
(incomingSchema) => {
|
||||
return incomingSchema.reduce((schema, column) => {
|
||||
if (column === 'x') {
|
||||
return [...schema, 'line', 'x'];
|
||||
}
|
||||
return [...schema, column];
|
||||
}, new Array<string>());
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
||||
return { items: items, messages: [] };
|
||||
transform(_: TransformContext, inputItems: Item[]): ItemResult {
|
||||
return {
|
||||
items: transformGroupedByPage(inputItems, (page, items) => {
|
||||
let lineNumber = -1;
|
||||
let lastY: number | undefined;
|
||||
return items.map((item) => {
|
||||
const y = item.data['y'];
|
||||
if (!lastY || y < lastY) {
|
||||
lineNumber++;
|
||||
}
|
||||
lastY = y;
|
||||
return item.withDataAddition({ line: lineNumber });
|
||||
});
|
||||
}),
|
||||
messages: [],
|
||||
};
|
||||
}
|
||||
}
|
||||
|
@ -33,5 +33,5 @@ export default abstract class ItemTransformer {
|
||||
this.schemaTransformer = schemaTransformer;
|
||||
}
|
||||
|
||||
abstract transform(context: TransformContext, items: Item[]): ItemResult;
|
||||
abstract transform(context: TransformContext, inputItems: Item[]): ItemResult;
|
||||
}
|
||||
|
19
core/src/transformer/transformerUtils.ts
Normal file
19
core/src/transformer/transformerUtils.ts
Normal file
@ -0,0 +1,19 @@
|
||||
import Item from '../Item';
|
||||
|
||||
/** Callback receiving the page number of a group and the items of that group; returns the replacement items. */
type PageItemTransformer = (page: number, items: Item[]) => Item[];

/**
 * Splits `items` into consecutive runs that share the same `page`, hands each run to
 * `groupedTransformer`, and concatenates the returned items in the original order.
 *
 * A new group is started whenever the page of an item differs from the page of the
 * current group, so every callback invocation only ever sees items of the page it is
 * told about (even if the input is not sorted by page).
 *
 * @param items the items to transform; not mutated
 * @param groupedTransformer invoked once per run of same-page items
 * @returns the flattened results of all `groupedTransformer` calls; empty for empty input
 */
export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] {
  const groups: { page: number; items: Item[] }[] = [];
  let currentGroup: { page: number; items: Item[] } | undefined;

  for (const item of items) {
    // Compare with !== (not >) so an out-of-order page never leaks into the previous group.
    if (currentGroup === undefined || item.page !== currentGroup.page) {
      currentGroup = { page: item.page, items: [item] };
      groups.push(currentGroup);
    } else {
      currentGroup.items.push(item);
    }
  }

  return new Array<Item>().concat(...groups.map((group) => groupedTransformer(group.page, group.items)));
}
|
31
core/test/transformer/transformerUtils.test.ts
Normal file
31
core/test/transformer/transformerUtils.test.ts
Normal file
@ -0,0 +1,31 @@
|
||||
import Item from 'src/Item';
|
||||
import { transformGroupedByPage } from 'src/transformer/transformerUtils';
|
||||
|
||||
describe('transformGroupedByPage', () => {
  // Without items there are no page groups, so the page transformer must never run.
  test('empty', () => {
    const transformedItems = transformGroupedByPage([], () => {
      // Throw explicitly instead of relying on the Jasmine-only `fail` global.
      throw new Error("shouldn't be called");
    });
    expect(transformedItems).toEqual([]);
  });

  // Each callback invocation receives exactly the items of one page, and returning
  // them unchanged yields the original flat list.
  test('pipe through', () => {
    const pageItems = [
      [new Item(0, { id: 1 })],
      [new Item(1, { id: 2 }), new Item(1, { id: 3 })],
      [new Item(2, { id: 4 })],
    ];
    const flattenedItems = new Array<Item>().concat(...pageItems);
    const transformedItems = transformGroupedByPage(flattenedItems, (page, items) => {
      expect(items).toEqual(pageItems[page]);
      return items;
    });
    expect(transformedItems).toEqual(flattenedItems);
  });

  // Items returned by the callback replace the original items in the result.
  test('change', () => {
    const input = [new Item(0, { v: 0 }), new Item(1, { v: 0 })];
    const transformedItems = transformGroupedByPage(input, (_, items) => {
      return [items[0].withData({ v: 1 })];
    });
    expect(transformedItems).toEqual(input.map((item) => item.withData({ v: 1 })));
  });
});
|
@ -1,9 +1,10 @@
|
||||
<script>
|
||||
import { scale, fade } from 'svelte/transition';
|
||||
import inView from '../actions/inView';
|
||||
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
||||
import type Item from '@core/Item';
|
||||
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
||||
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
|
||||
import inView from '../actions/inView';
|
||||
import { formatValue } from './formatValues';
|
||||
|
||||
export let schema: AnnotatedColumn[];
|
||||
export let itemsByPage: [number, Item[]][];
|
||||
@ -23,22 +24,6 @@
|
||||
}
|
||||
}
|
||||
|
||||
function format(value: object) {
|
||||
if (typeof value === 'number') {
|
||||
return (value as number).toFixed(2);
|
||||
}
|
||||
if (typeof value === 'object' && typeof Array.isArray(value)) {
|
||||
let array = value as Array<object>;
|
||||
if (array.length > 0 && typeof array[0] === 'number') {
|
||||
array = (array.map((element) =>
|
||||
((element as unknown) as number).toFixed(2)
|
||||
) as unknown) as Array<object>;
|
||||
}
|
||||
return '[' + array.join(', ') + ']';
|
||||
}
|
||||
return value;
|
||||
}
|
||||
|
||||
function calculateNextPageToRenderTo() {
|
||||
if (renderedMaxPage >= maxPage) {
|
||||
return;
|
||||
@ -88,7 +73,7 @@
|
||||
{/if}
|
||||
<td>{itemIdx}</td>
|
||||
{#each schema as column}
|
||||
<td class="select-all">{format(item.data[column.name])}</td>
|
||||
<td class="select-all">{formatValue(item.data[column.name])}</td>
|
||||
{/each}
|
||||
</tr>
|
||||
{/each}
|
||||
|
16
ui/src/debug/formatValues.ts
Normal file
16
ui/src/debug/formatValues.ts
Normal file
@ -0,0 +1,16 @@
|
||||
/**
 * Formats a cell value for display in the debug table.
 *
 * - integers are returned unchanged
 * - other numbers are rendered with two decimals (`toFixed(2)`)
 * - arrays are rendered as `[a, b, ...]`; when every element is a number, each
 *   element is rendered with two decimals
 * - everything else (strings, `null`, `undefined`, plain objects, ...) is returned as is
 *
 * @param value the raw value to format
 * @returns the formatted string, or the untouched value when no formatting applies
 */
export function formatValue(value: unknown): unknown {
  if (Number.isInteger(value)) {
    return value;
  }
  if (typeof value === 'number') {
    return value.toFixed(2);
  }
  // Array.isArray (not `typeof Array.isArray(...)`, which is always truthy) so that
  // null and non-array objects fall through instead of crashing on length/join.
  if (Array.isArray(value)) {
    const elements: unknown[] = value;
    const parts = elements.every((element): element is number => typeof element === 'number')
      ? elements.map((element) => element.toFixed(2))
      : elements;
    return '[' + parts.join(', ') + ']';
  }
  return value;
}
|
Loading…
Reference in New Issue
Block a user