mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-25 12:01:45 +02:00
Sort line items on X axis
This commit is contained in:
parent
08509953dc
commit
915827be0c
@ -8,8 +8,15 @@ import AdjustHeight from './transformer/AdjustHeight';
|
|||||||
import CalculateCoordinates from './transformer/CalculateCoordinates';
|
import CalculateCoordinates from './transformer/CalculateCoordinates';
|
||||||
import CalculateStatistics from './transformer/CacluclateStatistics';
|
import CalculateStatistics from './transformer/CacluclateStatistics';
|
||||||
import CompactLines from './transformer/CompactLines';
|
import CompactLines from './transformer/CompactLines';
|
||||||
|
import SortXWithinLines from './transformer/SortXWithinLines';
|
||||||
|
|
||||||
const transformers = [new AdjustHeight(), new CalculateCoordinates(), new CalculateStatistics(), new CompactLines()];
|
const transformers = [
|
||||||
|
new AdjustHeight(),
|
||||||
|
new CalculateCoordinates(),
|
||||||
|
new CalculateStatistics(),
|
||||||
|
new CompactLines(),
|
||||||
|
new SortXWithinLines(),
|
||||||
|
];
|
||||||
|
|
||||||
const defaultConfig: Config = {
|
const defaultConfig: Config = {
|
||||||
pdfjsParams: {
|
pdfjsParams: {
|
||||||
|
@ -3,12 +3,14 @@ import Item from '../Item';
|
|||||||
import ItemGroup from './ItemGroup';
|
import ItemGroup from './ItemGroup';
|
||||||
import Page from './Page';
|
import Page from './Page';
|
||||||
|
|
||||||
|
/**
 * Maps an item to the value it is grouped by.
 *
 * The result is typed as `unknown` rather than `any`: grouping only compares
 * keys with strict (in)equality, so no further knowledge of the key is needed.
 */
type KeyExtractor = (item: Item) => unknown;
|
||||||
type PageItemTransformer = (page: number, items: Item[]) => Item[];
|
type PageItemTransformer = (page: number, items: Item[]) => Item[];
|
||||||
|
type LineItemTransformer = (page: number, line: number, items: Item[]) => Item[];
|
||||||
|
|
||||||
export function groupByPage(items: Item[]): Item[][] {
|
function groupBy(items: Item[], extractKey: KeyExtractor): Item[][] {
|
||||||
return items.reduce((pageItems: Item[][], item: Item) => {
|
return items.reduce((pageItems: Item[][], item: Item) => {
|
||||||
const lastPageItems = pageItems[pageItems.length - 1];
|
const lastPageItems = pageItems[pageItems.length - 1];
|
||||||
if (!lastPageItems || item.page > lastPageItems[0]?.page) {
|
if (!lastPageItems || extractKey(item) !== extractKey(lastPageItems[0])) {
|
||||||
pageItems.push([item]);
|
pageItems.push([item]);
|
||||||
} else {
|
} else {
|
||||||
lastPageItems.push(item);
|
lastPageItems.push(item);
|
||||||
@ -17,16 +19,12 @@ export function groupByPage(items: Item[]): Item[][] {
|
|||||||
}, []);
|
}, []);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Groups the given items by their `page` value by delegating to `groupBy`.
 *
 * @param items the items to group
 * @returns the groups produced by `groupBy` when keyed on `item.page`
 */
export function groupByPage(items: Item[]): Item[][] {
  const pageOf: KeyExtractor = (item) => item.page;
  return groupBy(items, pageOf);
}
|
||||||
|
|
||||||
export function groupByElement(items: Item[], elementName: string): Item[][] {
|
export function groupByElement(items: Item[], elementName: string): Item[][] {
|
||||||
return items.reduce((groupedItems: Item[][], item: Item) => {
|
return groupBy(items, (item) => item.data[elementName]);
|
||||||
const lastGroupItems = groupedItems[groupedItems.length - 1];
|
|
||||||
if (!lastGroupItems || item.data[elementName] !== lastGroupItems[0]?.data[elementName]) {
|
|
||||||
groupedItems.push([item]);
|
|
||||||
} else {
|
|
||||||
lastGroupItems.push(item);
|
|
||||||
}
|
|
||||||
return groupedItems;
|
|
||||||
}, []);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] {
|
export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] {
|
||||||
@ -35,6 +33,16 @@ export function transformGroupedByPage(items: Item[], groupedTransformer: PageIt
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Runs a transformer over the items of every line of every page and
 * concatenates what it returns.
 *
 * The items are grouped with `groupByPage`, and each page group is then grouped
 * with `groupByElement(…, 'line')`. For every resulting line group the
 * transformer receives the `page` of the page group's first item, the
 * `data['line']` value of the line group's first item, and the line group itself.
 *
 * @param items the items to transform
 * @param groupedTransformer callback invoked once per page/line group
 * @returns all items returned by the callback, in invocation order
 */
export function transformGroupedByPageAndLine(items: Item[], groupedTransformer: LineItemTransformer): Item[] {
  const result: Item[] = [];
  for (const itemsOfPage of groupByPage(items)) {
    for (const itemsOfLine of groupByElement(itemsOfPage, 'line')) {
      const transformed = groupedTransformer(itemsOfPage[0].page, itemsOfLine[0].data['line'], itemsOfLine);
      result.push(...transformed);
    }
  }
  return result;
}
|
||||||
|
|
||||||
export function asPages(items: Item[], itemMerger?: ItemMerger): Page[] {
|
export function asPages(items: Item[], itemMerger?: ItemMerger): Page[] {
|
||||||
return groupByPage(items).map((pageItems: Item[]) => {
|
return groupByPage(items).map((pageItems: Item[]) => {
|
||||||
let itemGroups: ItemGroup[];
|
let itemGroups: ItemGroup[];
|
||||||
|
26
core/src/transformer/SortXWithinLines.ts
Normal file
26
core/src/transformer/SortXWithinLines.ts
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
import Item from '../Item';
|
||||||
|
import ItemResult from '../ItemResult';
|
||||||
|
import ItemTransformer from './ItemTransformer';
|
||||||
|
import TransformContext from './TransformContext';
|
||||||
|
import { transformGroupedByPageAndLine } from '../support/itemUtils';
|
||||||
|
|
||||||
|
/**
 * Transformer that orders the items of each line by their `x` value.
 */
export default class SortXWithinLines extends ItemTransformer {
  constructor() {
    super('Sort by X', 'Sorts the items of a line by the x coordinate', {
      requireColumns: ['line', 'x'],
      // itemMerger: {
      //   groupKey: 'line',
      //   merge: mergeLineItems,
      // },
    });
  }

  /**
   * Sorts the items of every page/line group in ascending order of `data['x']`.
   *
   * @param _ the transform context (unused)
   * @param inputItems the items to sort; grouping is done by `transformGroupedByPageAndLine`
   * @returns the re-ordered items together with an empty message list
   */
  transform(_: TransformContext, inputItems: Item[]): ItemResult {
    const sortedItems = transformGroupedByPageAndLine(inputItems, (_page, _line, lineItems) =>
      // Array.prototype.sort works in place; sort a copy so the array handed
      // in by the grouping helper is left untouched.
      [...lineItems].sort((a, b) => a.data['x'] - b.data['x']),
    );
    return { items: sortedItems, messages: [] };
  }
}
|
@ -1,9 +1,16 @@
|
|||||||
import Item from 'src/Item';
|
import Item from 'src/Item';
|
||||||
import Page from 'src/support/Page';
|
import Page from 'src/support/Page';
|
||||||
import { groupByPage, groupByElement, transformGroupedByPage, asPages } from 'src/support/itemUtils';
|
import {
|
||||||
|
groupByPage,
|
||||||
|
groupByElement,
|
||||||
|
transformGroupedByPage,
|
||||||
|
transformGroupedByPageAndLine,
|
||||||
|
asPages,
|
||||||
|
} from 'src/support/itemUtils';
|
||||||
import ItemGroup from 'src/support/ItemGroup';
|
import ItemGroup from 'src/support/ItemGroup';
|
||||||
import ItemMerger from 'src/ItemMerger';
|
import ItemMerger from 'src/ItemMerger';
|
||||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||||
|
import { items } from 'test/transformer/testItems';
|
||||||
|
|
||||||
describe('groupByPage', () => {
|
describe('groupByPage', () => {
|
||||||
test('empty', async () => {
|
test('empty', async () => {
|
||||||
@ -68,6 +75,30 @@ describe('transformGroupedByPage', () => {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe('transformGroupedByPageAndLine', () => {
  // With no input items there are no groups, so the transformer must never be invoked.
  test('empty', async () => {
    const transformedItems = transformGroupedByPageAndLine([], () => fail("shouldn't be called"));
    expect(transformedItems).toEqual([]);
  });

  // Each group is collapsed into one marker item encoding "<page>/<line>:<group size>",
  // so the expectation pins both the grouping and the arguments passed to the callback.
  // NOTE(review): presumably the first argument of the `items` helper is the page index — confirm in testItems.
  test('transform', async () => {
    const pageItems = [
      items(0, [{ line: 1, id: 1 }]),
      items(1, [
        { line: 1, id: 2 },
        { line: 1, id: 3 },
        { line: 2, id: 4 },
      ]),
      items(2, [{ line: 1, id: 5 }]),
    ];
    const flattenedItems = new Array<Item>().concat(...pageItems);
    // The callback parameter is named `lineItems` so it does not shadow the imported `items` helper.
    const transformedItems = transformGroupedByPageAndLine(flattenedItems, (page, line, lineItems) => {
      return [new Item(0, { group: `${page}/${line}:${lineItems.length}` })];
    });
    expect(transformedItems.map((item) => item.data['group'])).toEqual(['0/1:1', '1/1:2', '1/2:1', '2/1:1']);
  });
});
|
||||||
|
|
||||||
describe('asPages', () => {
|
describe('asPages', () => {
|
||||||
test('empty', async () => {
|
test('empty', async () => {
|
||||||
expect(groupByPage([])).toEqual([]);
|
expect(groupByPage([])).toEqual([]);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user