mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-17 10:20:47 +01:00
Make LineItemMerger standalone and re-usable
This commit is contained in:
parent
e6a18fa0d8
commit
229cb53eb0
@ -1,6 +0,0 @@
|
||||
import type Item from './Item';
|
||||
|
||||
/**
 * Contract for an object that merges a list of items into a single item.
 */
export default interface ItemMerger {
  /**
   * Key associated with the grouping of items.
   * NOTE(review): presumably the name of the item data field to group by
   * (e.g. 'line') — confirm against the callers.
   */
  groupKey: string;

  /**
   * Merges the given items into one item.
   *
   * @param items the items to merge
   * @returns the merged item
   */
  merge(items: Item[]): Item;
}
|
@ -1,4 +1,11 @@
|
||||
import type ItemMerger from './ItemMerger';
|
||||
import type ItemMerger from './support/ItemMerger';
|
||||
|
||||
/**
 * Optional debug-related settings.
 */
interface Debug {
  /**
   * When present, the debug UI groups items and displays a merged item
   * for each group.
   */
  readonly itemMerger?: ItemMerger;
}
|
||||
|
||||
export default interface TransformDescriptor {
|
||||
readonly requireColumns: string[];
|
||||
@ -7,7 +14,7 @@ export default interface TransformDescriptor {
|
||||
/**
|
||||
* If this is set, the debug UI will group items and display a merged item.
|
||||
*/
|
||||
readonly itemMerger?: ItemMerger;
|
||||
readonly debug?: Debug;
|
||||
}
|
||||
|
||||
const defaults: TransformDescriptor = {
|
||||
|
9
core/src/support/ItemMerger.ts
Normal file
9
core/src/support/ItemMerger.ts
Normal file
@ -0,0 +1,9 @@
|
||||
import type Item from '../Item';
|
||||
|
||||
/**
 * Groups individual items and merges them into a kind of top-level summary item.
 */
export default abstract class ItemMerger {
  /**
   * @param groupKey key associated with the grouping; exposed as a public property
   */
  constructor(public groupKey: string) {}

  /**
   * Merges the given items into a single item.
   *
   * @param items the items to merge
   * @returns the merged item
   */
  abstract merge(items: Item[]): Item;
}
|
30
core/src/support/LineItemMerger.ts
Normal file
30
core/src/support/LineItemMerger.ts
Normal file
@ -0,0 +1,30 @@
|
||||
import ItemMerger from './ItemMerger';
|
||||
import Item from '../Item';
|
||||
|
||||
/**
 * Item merger registered under the group key 'line' that collapses a list of
 * items into one summary item.
 */
export default class LineItemMerger extends ItemMerger {
  constructor() {
    super('line');
  }

  /**
   * Merges the given items into a single item.
   *
   * The result takes `page` and `line` from the first item, joins all `str`
   * values with a single space (in input order), uses the smallest `x` and
   * `y`, the sum of all `width` values and the largest `height`. `fontName`
   * and `dir` become arrays of the distinct values in first-seen order.
   *
   * @param items the items to merge; must contain at least one item
   * @returns a new item carrying the merged data
   * @throws TypeError if `items` is empty
   */
  merge(items: Item[]): Item {
    const first = items[0];
    if (first === undefined) {
      // Without this guard the failure is an opaque "cannot read properties of undefined".
      throw new TypeError('LineItemMerger.merge() requires at least one item');
    }

    const page = first.page;
    const line = first.data['line'];
    const str = items.map((item) => item.data['str']).join(' ');
    const x = Math.min(...items.map((item) => item.data['x']));
    const y = Math.min(...items.map((item) => item.data['y']));
    const width = items.reduce((sum, item) => sum + item.data['width'], 0);
    const height = Math.max(...items.map((item) => item.data['height']));
    // A Set keeps insertion order, so the distinct values stay in first-seen order.
    const fontNames = [...new Set(items.map((item) => item.data['fontName']))];
    const directions = [...new Set(items.map((item) => item.data['dir']))];

    return new Item(page, {
      str,
      line,
      x,
      y,
      width,
      height,
      fontName: fontNames,
      dir: directions,
    });
  }
}
|
@ -1,4 +1,4 @@
|
||||
import ItemMerger from '../ItemMerger';
|
||||
import ItemMerger from './ItemMerger';
|
||||
import Item from '../Item';
|
||||
import ItemGroup from './ItemGroup';
|
||||
import Page from './Page';
|
||||
|
@ -3,6 +3,7 @@ import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import { transformGroupedByPage } from '../support/itemUtils';
|
||||
import LineItemMerger from '../support/LineItemMerger';
|
||||
|
||||
export default class CompactLines extends ItemTransformer {
|
||||
constructor() {
|
||||
@ -11,9 +12,8 @@ export default class CompactLines extends ItemTransformer {
|
||||
'Combines items on the same y-axis',
|
||||
{
|
||||
requireColumns: ['str', 'y', 'height'],
|
||||
itemMerger: {
|
||||
groupKey: 'line',
|
||||
merge: mergeLineItems,
|
||||
debug: {
|
||||
itemMerger: new LineItemMerger(),
|
||||
},
|
||||
},
|
||||
(incomingSchema) => {
|
||||
@ -48,25 +48,3 @@ export default class CompactLines extends ItemTransformer {
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Merges the given items into a single summary item.
 *
 * The result takes `page` and `line` from the first item, joins all `str`
 * values with a single space (in input order), uses the smallest `x` and `y`,
 * the sum of all `width` values and the largest `height`. `fontName` and `dir`
 * become arrays of the distinct values in first-seen order.
 *
 * @param items the items to merge; must contain at least one item
 * @returns a new item carrying the merged data
 * @throws TypeError if `items` is empty
 */
function mergeLineItems(items: Item[]): Item {
  const first = items[0];
  if (first === undefined) {
    // Without this guard the failure is an opaque "cannot read properties of undefined".
    throw new TypeError('mergeLineItems() requires at least one item');
  }

  const page = first.page;
  const line = first.data['line'];
  const str = items.map((item) => item.data['str']).join(' ');
  const x = Math.min(...items.map((item) => item.data['x']));
  const y = Math.min(...items.map((item) => item.data['y']));
  const width = items.reduce((sum, item) => sum + item.data['width'], 0);
  const height = Math.max(...items.map((item) => item.data['height']));
  // A Set keeps insertion order, so the distinct values stay in first-seen order.
  const fontNames = [...new Set(items.map((item) => item.data['fontName']))];
  const directions = [...new Set(items.map((item) => item.data['dir']))];

  return new Item(page, {
    str,
    line,
    x,
    y,
    width,
    height,
    fontName: fontNames,
    dir: directions,
  });
}
|
||||
|
@ -2,16 +2,16 @@ import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import LineItemMerger from '../support/LineItemMerger';
|
||||
import { transformGroupedByPageAndLine } from '../support/itemUtils';
|
||||
|
||||
export default class SortXWithinLines extends ItemTransformer {
|
||||
/**
 * Configures the transformer with the strings 'Sort by X' and
 * 'Sorts the items of a line by the x coordinate' (presumably its name and
 * description — confirm against ItemTransformer), requires the 'line' and
 * 'x' columns, and supplies a LineItemMerger as the debug item merger.
 */
constructor() {
  super('Sort by X', 'Sorts the items of a line by the x coordinate', {
    requireColumns: ['line', 'x'],
    debug: {
      itemMerger: new LineItemMerger(),
    },
  });
}
|
||||
|
||||
|
56
core/test/debug/LineItemMerger.test.ts
Normal file
56
core/test/debug/LineItemMerger.test.ts
Normal file
@ -0,0 +1,56 @@
|
||||
// LineItemMerger lives in src/support (core/src/support/LineItemMerger.ts).
import LineItemMerger from 'src/support/LineItemMerger';
import Item from 'src/Item';
import { items } from '../testItems';

const itemMerger = new LineItemMerger();

// Merges three items of line 2 and checks the summary item: first item's page
// and line, space-joined text, minimal x/y, summed width, maximal height and
// de-duplicated fontName/dir.
test('Basics', () => {
  expect(itemMerger.groupKey).toEqual('line');

  const mergedItem = itemMerger.merge(
    items(0, [
      {
        line: 2,
        x: 240,
        y: 585,
        str: 'Dies ist eine Test-PDF',
        fontName: 'g_d0_f2',
        dir: 'ltr',
        width: 108.62,
        height: 11,
      },
      {
        line: 2,
        x: 352.69,
        y: 585,
        str: '.',
        fontName: 'g_d0_f2',
        dir: 'ltr',
        width: 3.06,
        height: 11,
      },
      {
        line: 2,
        x: 348,
        y: 588,
        str: '1',
        fontName: 'g_d0_f2',
        dir: 'ltr',
        width: 4.08,
        height: 7.33,
      },
    ]),
  );

  // UUIDs are stripped on both sides before comparing.
  expect(mergedItem.withoutUuid()).toEqual(
    new Item(0, {
      line: 2,
      x: 240,
      y: 585,
      str: 'Dies ist eine Test-PDF . 1',
      fontName: ['g_d0_f2'],
      dir: ['ltr'],
      width: 115.76,
      height: 11,
    }).withoutUuid(),
  );
});
|
@ -8,7 +8,7 @@ import {
|
||||
asPages,
|
||||
} from 'src/support/itemUtils';
|
||||
import ItemGroup from 'src/support/ItemGroup';
|
||||
import ItemMerger from 'src/ItemMerger';
|
||||
import ItemMerger from 'src/support/ItemMerger';
|
||||
import { items } from 'test/testItems';
|
||||
|
||||
describe('groupByPage', () => {
|
||||
|
@ -1,4 +1,3 @@
|
||||
import Item from 'src/Item';
|
||||
import CompactLines from 'src/transformer/CompactLines';
|
||||
import { emptyContext } from './testContext';
|
||||
import { items } from '../testItems';
|
||||
@ -69,55 +68,3 @@ test('Transform - lowered charactes (dict.pdf)', async () => {
|
||||
);
|
||||
expect(results.items.map((item) => item.data['line'])).toEqual([0, 0, 1, 1, 1, 1, 2]);
|
||||
});
|
||||
|
||||
// Checks the item merger exposed on the transformer's descriptor: group key
// 'line', and a merge of three line-2 items into one summary item.
test('Item Merger', () => {
  const itemMerger = transformer.descriptor.itemMerger;
  expect(itemMerger?.groupKey).toEqual('line');

  const mergedItem = itemMerger?.merge(
    items(0, [
      {
        line: 2,
        x: 240,
        y: 585,
        str: 'Dies ist eine Test-PDF',
        fontName: 'g_d0_f2',
        dir: 'ltr',
        width: 108.62,
        height: 11,
      },
      {
        line: 2,
        x: 352.69,
        y: 585,
        str: '.',
        fontName: 'g_d0_f2',
        dir: 'ltr',
        width: 3.06,
        height: 11,
      },
      {
        line: 2,
        x: 348,
        y: 588,
        str: '1',
        fontName: 'g_d0_f2',
        dir: 'ltr',
        width: 4.08,
        height: 7.33,
      },
    ]),
  );

  // UUIDs are stripped on both sides before comparing.
  expect(mergedItem?.withoutUuid()).toEqual(
    new Item(0, {
      line: 2,
      x: 240,
      y: 585,
      str: 'Dies ist eine Test-PDF . 1',
      fontName: ['g_d0_f2'],
      dir: ['ltr'],
      width: 115.76,
      height: 11,
    }).withoutUuid(),
  );
});
|
||||
|
@ -25,7 +25,7 @@
|
||||
$: stageResult = debug.stageResults(currentStage);
|
||||
$: pageIsPinned = !isNaN(pinnedPage);
|
||||
$: pagesNumbers = new Set(stageResult.items.map((item) => item.page));
|
||||
$: pages = asPages(stageResult.items, stageResult.descriptor?.itemMerger);
|
||||
$: pages = asPages(stageResult.items, stageResult.descriptor?.debug?.itemMerger);
|
||||
$: maxPage = Math.max(...pagesNumbers);
|
||||
$: visiblePages = pageIsPinned ? pages.filter((page) => page.index === pinnedPage) : pages;
|
||||
</script>
|
||||
|
Loading…
Reference in New Issue
Block a user