mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-03 20:28:54 +01:00
Cleanup & simple line detection
This commit is contained in:
parent
71fb6a23ff
commit
d8bc6d100b
@ -9,7 +9,7 @@ import ParseResult from './ParseResult';
|
|||||||
export default class PdfParser {
|
export default class PdfParser {
|
||||||
pdfjs: any;
|
pdfjs: any;
|
||||||
defaultParams: object;
|
defaultParams: object;
|
||||||
schema = ['str', 'fontName', 'dir', 'transform', 'width', 'height'];
|
schema = ['transform', 'str', 'fontName', 'dir', 'width', 'height'];
|
||||||
|
|
||||||
constructor(pdfjs: any, defaultParams = {}) {
|
constructor(pdfjs: any, defaultParams = {}) {
|
||||||
this.pdfjs = pdfjs;
|
this.pdfjs = pdfjs;
|
||||||
|
@ -3,6 +3,7 @@ import Item from '../Item';
|
|||||||
import ItemResult from '../ItemResult';
|
import ItemResult from '../ItemResult';
|
||||||
import ItemTransformer from './ItemTransformer';
|
import ItemTransformer from './ItemTransformer';
|
||||||
import TransformContext from './TransformContext';
|
import TransformContext from './TransformContext';
|
||||||
|
import { transformGroupedByPage } from './transformerUtils';
|
||||||
|
|
||||||
export default class AdjustHeight extends ItemTransformer {
|
export default class AdjustHeight extends ItemTransformer {
|
||||||
constructor() {
|
constructor() {
|
||||||
@ -11,30 +12,27 @@ export default class AdjustHeight extends ItemTransformer {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(context: TransformContext, items: Item[]): ItemResult {
|
transform(context: TransformContext, inputItems: Item[]): ItemResult {
|
||||||
const newItems: Item[] = [];
|
|
||||||
let page = -1;
|
|
||||||
let pageViewport: PageViewport;
|
|
||||||
//TODO groupBy page
|
|
||||||
let correctedHeights = 0;
|
let correctedHeights = 0;
|
||||||
items.forEach((item) => {
|
return {
|
||||||
if (item.page !== page) {
|
items: transformGroupedByPage(inputItems, (page, items) => {
|
||||||
pageViewport = context.pageViewports[item.page];
|
const pageViewport = context.pageViewports[page];
|
||||||
page = page;
|
return items.map((item) => {
|
||||||
}
|
const itemTransform = item.data['transform'];
|
||||||
const itemTransform = item.data['transform'];
|
const itemHeight = item.data['height'];
|
||||||
const itemHeight = item.data['height'];
|
const tx = pageViewport.transformFunction(itemTransform);
|
||||||
const tx = pageViewport.transformFunction(itemTransform);
|
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
||||||
const fontHeight = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
|
const dividedHeight = itemHeight / fontHeight;
|
||||||
const dividedHeight = itemHeight / fontHeight;
|
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
|
||||||
const newHeight = Number.isNaN(dividedHeight) || dividedHeight <= 1 ? itemHeight : dividedHeight;
|
if (newHeight === itemHeight) {
|
||||||
if (newHeight !== itemHeight) {
|
return item;
|
||||||
correctedHeights++;
|
} else {
|
||||||
newItems.push(item.withDataAddition({ height: newHeight }));
|
correctedHeights++;
|
||||||
} else {
|
return item.withDataAddition({ height: newHeight });
|
||||||
newItems.push(item);
|
}
|
||||||
}
|
});
|
||||||
});
|
}),
|
||||||
return { items, messages: [`${correctedHeights} corrected heights`] };
|
messages: [`${correctedHeights} corrected heights`],
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -26,9 +26,9 @@ export default class CalculateStatistics extends ItemTransformer {
|
|||||||
let maxHeight = 0;
|
let maxHeight = 0;
|
||||||
let maxHeightFont;
|
let maxHeightFont;
|
||||||
|
|
||||||
items.forEach((item) => {
|
items.forEach((inputItems) => {
|
||||||
const itemHeight = item.data['height'];
|
const itemHeight = inputItems.data['height'];
|
||||||
const itemFont = item.data['fontName'];
|
const itemFont = inputItems.data['fontName'];
|
||||||
heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1;
|
heightToOccurrence[itemHeight] = heightToOccurrence[itemHeight] ? heightToOccurrence[itemHeight] + 1 : 1;
|
||||||
fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? fontToOccurrence[itemFont] + 1 : 1;
|
fontToOccurrence[itemFont] = fontToOccurrence[itemFont] ? fontToOccurrence[itemFont] + 1 : 1;
|
||||||
if (itemHeight > maxHeight) {
|
if (itemHeight > maxHeight) {
|
||||||
|
@ -22,13 +22,15 @@ export default class CalculateCoordinates extends ItemTransformer {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
transform(_: TransformContext, inputItems: Item[]): ItemResult {
|
||||||
const transformedItems = items.map((item) => {
|
return {
|
||||||
const transform: number[] = item.data['transform'];
|
items: inputItems.map((item) => {
|
||||||
const x = transform[4];
|
const transform: number[] = item.data['transform'];
|
||||||
const y = transform[5];
|
const x = transform[4];
|
||||||
return item.withDataAddition({ x, y });
|
const y = transform[5];
|
||||||
});
|
return item.withDataAddition({ x, y });
|
||||||
return { items: transformedItems, messages: [] };
|
}),
|
||||||
|
messages: [],
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -2,15 +2,42 @@ import Item from '../Item';
|
|||||||
import ItemResult from '../ItemResult';
|
import ItemResult from '../ItemResult';
|
||||||
import ItemTransformer from './ItemTransformer';
|
import ItemTransformer from './ItemTransformer';
|
||||||
import TransformContext from './TransformContext';
|
import TransformContext from './TransformContext';
|
||||||
|
import { transformGroupedByPage } from './transformerUtils';
|
||||||
|
|
||||||
export default class CompactLines extends ItemTransformer {
|
export default class CompactLines extends ItemTransformer {
|
||||||
constructor() {
|
constructor() {
|
||||||
super('Compact Lines', 'Combines items on the same y-axis', {
|
super(
|
||||||
requireColumns: ['str', 'y'],
|
'Compact Lines',
|
||||||
});
|
'Combines items on the same y-axis',
|
||||||
|
{
|
||||||
|
requireColumns: ['str', 'y'],
|
||||||
|
},
|
||||||
|
(incomingSchema) => {
|
||||||
|
return incomingSchema.reduce((schema, column) => {
|
||||||
|
if (column === 'x') {
|
||||||
|
return [...schema, 'line', 'x'];
|
||||||
|
}
|
||||||
|
return [...schema, column];
|
||||||
|
}, new Array<string>());
|
||||||
|
},
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
transform(_: TransformContext, items: Item[]): ItemResult {
|
transform(_: TransformContext, inputItems: Item[]): ItemResult {
|
||||||
return { items: items, messages: [] };
|
return {
|
||||||
|
items: transformGroupedByPage(inputItems, (page, items) => {
|
||||||
|
let lineNumber = -1;
|
||||||
|
let lastY: number | undefined;
|
||||||
|
return items.map((item) => {
|
||||||
|
const y = item.data['y'];
|
||||||
|
if (!lastY || y < lastY) {
|
||||||
|
lineNumber++;
|
||||||
|
}
|
||||||
|
lastY = y;
|
||||||
|
return item.withDataAddition({ line: lineNumber });
|
||||||
|
});
|
||||||
|
}),
|
||||||
|
messages: [],
|
||||||
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -33,5 +33,5 @@ export default abstract class ItemTransformer {
|
|||||||
this.schemaTransformer = schemaTransformer;
|
this.schemaTransformer = schemaTransformer;
|
||||||
}
|
}
|
||||||
|
|
||||||
abstract transform(context: TransformContext, items: Item[]): ItemResult;
|
abstract transform(context: TransformContext, inputItems: Item[]): ItemResult;
|
||||||
}
|
}
|
||||||
|
19
core/src/transformer/transformerUtils.ts
Normal file
19
core/src/transformer/transformerUtils.ts
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
import Item from '../Item';
|
||||||
|
|
||||||
|
/** Callback receiving all consecutive items of one page and returning their replacements. */
type PageItemTransformer = (page: number, items: Item[]) => Item[];

/**
 * Splits `items` into runs of consecutive items sharing the same `page`, hands each run to
 * `groupedTransformer` and returns the concatenation of the callback results in input order.
 *
 * A new run starts whenever the page value changes (not only when it grows), so an item is
 * never handed to the callback under a page number that is not its own.
 *
 * @param items the items to transform; not mutated
 * @param groupedTransformer invoked once per run with the run's page and its items
 * @returns the flattened results of all callback invocations (empty for empty input)
 */
export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] {
  const pageGroups: Item[][] = [];
  for (const item of items) {
    const currentGroup = pageGroups[pageGroups.length - 1];
    // Groups are only ever created with one element, so currentGroup[0] always exists.
    if (currentGroup !== undefined && currentGroup[0].page === item.page) {
      currentGroup.push(item);
    } else {
      pageGroups.push([item]);
    }
  }
  return new Array<Item>().concat(...pageGroups.map((group) => groupedTransformer(group[0].page, group)));
}
|
31
core/test/transformer/transformerUtils.test.ts
Normal file
31
core/test/transformer/transformerUtils.test.ts
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
import Item from 'src/Item';
|
||||||
|
import { transformGroupedByPage } from 'src/transformer/transformerUtils';
|
||||||
|
|
||||||
|
// Unit tests for transformGroupedByPage: empty input, identity pass-through and item replacement.
describe('transformGroupedByPage', () => {
  test('empty', () => {
    // Throw explicitly instead of relying on the Jasmine-only global `fail`.
    const transformedItems = transformGroupedByPage([], () => {
      throw new Error("shouldn't be called");
    });
    expect(transformedItems).toEqual([]);
  });

  test('pipe through', () => {
    const pageItems = [
      [new Item(0, { id: 1 })],
      [new Item(1, { id: 2 }), new Item(1, { id: 3 })],
      [new Item(2, { id: 4 })],
    ];
    const flattenedItems = new Array<Item>().concat(...pageItems);
    const transformedItems = transformGroupedByPage(flattenedItems, (page, items) => {
      // Each callback invocation must receive exactly the items of its page.
      expect(items).toEqual(pageItems[page]);
      return items;
    });
    expect(transformedItems).toEqual(flattenedItems);
  });

  test('change', () => {
    const input = [new Item(0, { v: 0 }), new Item(1, { v: 0 })];
    const transformedItems = transformGroupedByPage(input, (_, items) => {
      return [items[0].withData({ v: 1 })];
    });
    expect(transformedItems).toEqual(input.map((item) => item.withData({ v: 1 })));
  });
});
|
@ -1,9 +1,10 @@
|
|||||||
<script>
|
<script>
|
||||||
import { scale, fade } from 'svelte/transition';
|
import { scale, fade } from 'svelte/transition';
|
||||||
import inView from '../actions/inView';
|
|
||||||
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
|
||||||
import type Item from '@core/Item';
|
import type Item from '@core/Item';
|
||||||
|
import type AnnotatedColumn from '@core/debug/AnnotatedColumn';
|
||||||
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
|
import ColumnAnnotation from '../../../core/src/debug/ColumnAnnotation';
|
||||||
|
import inView from '../actions/inView';
|
||||||
|
import { formatValue } from './formatValues';
|
||||||
|
|
||||||
export let schema: AnnotatedColumn[];
|
export let schema: AnnotatedColumn[];
|
||||||
export let itemsByPage: [number, Item[]][];
|
export let itemsByPage: [number, Item[]][];
|
||||||
@ -23,22 +24,6 @@
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function format(value: object) {
|
|
||||||
if (typeof value === 'number') {
|
|
||||||
return (value as number).toFixed(2);
|
|
||||||
}
|
|
||||||
if (typeof value === 'object' && typeof Array.isArray(value)) {
|
|
||||||
let array = value as Array<object>;
|
|
||||||
if (array.length > 0 && typeof array[0] === 'number') {
|
|
||||||
array = (array.map((element) =>
|
|
||||||
((element as unknown) as number).toFixed(2)
|
|
||||||
) as unknown) as Array<object>;
|
|
||||||
}
|
|
||||||
return '[' + array.join(', ') + ']';
|
|
||||||
}
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
function calculateNextPageToRenderTo() {
|
function calculateNextPageToRenderTo() {
|
||||||
if (renderedMaxPage >= maxPage) {
|
if (renderedMaxPage >= maxPage) {
|
||||||
return;
|
return;
|
||||||
@ -88,7 +73,7 @@
|
|||||||
{/if}
|
{/if}
|
||||||
<td>{itemIdx}</td>
|
<td>{itemIdx}</td>
|
||||||
{#each schema as column}
|
{#each schema as column}
|
||||||
<td class="select-all">{format(item.data[column.name])}</td>
|
<td class="select-all">{formatValue(item.data[column.name])}</td>
|
||||||
{/each}
|
{/each}
|
||||||
</tr>
|
</tr>
|
||||||
{/each}
|
{/each}
|
||||||
|
16
ui/src/debug/formatValues.ts
Normal file
16
ui/src/debug/formatValues.ts
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
/**
 * Formats a single item value for display in the debug table.
 *
 * - integers are returned unchanged
 * - other numbers are rendered with two decimals
 * - arrays are rendered as `[a, b, ...]`, with every numeric element shown with two decimals
 * - anything else (strings, plain objects, null, undefined) is returned unchanged
 *
 * @param value the raw value; declared as `object`, but the runtime type is inspected because
 *   numbers and strings are handled as well
 * @returns the formatted string, or the value itself when no formatting applies
 */
export function formatValue(value: object) {
  const raw: unknown = value;
  if (typeof raw === 'number') {
    return Number.isInteger(raw) ? value : raw.toFixed(2);
  }
  // A real array check: `typeof Array.isArray(x)` is always the truthy string 'boolean',
  // which sent null and plain objects down the array path and made them throw.
  if (Array.isArray(raw)) {
    // Format per element so a non-numeric entry after a numeric one cannot throw.
    const parts = raw.map((element) => (typeof element === 'number' ? element.toFixed(2) : element));
    return '[' + parts.join(', ') + ']';
  }
  return value;
}
|
Loading…
Reference in New Issue
Block a user