mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-23 05:10:47 +01:00
Sort line items on X axis
This commit is contained in:
parent
08509953dc
commit
915827be0c
@ -8,8 +8,15 @@ import AdjustHeight from './transformer/AdjustHeight';
|
||||
import CalculateCoordinates from './transformer/CalculateCoordinates';
|
||||
import CalculateStatistics from './transformer/CacluclateStatistics';
|
||||
import CompactLines from './transformer/CompactLines';
|
||||
import SortXWithinLines from './transformer/SortXWithinLines';
|
||||
|
||||
const transformers = [new AdjustHeight(), new CalculateCoordinates(), new CalculateStatistics(), new CompactLines()];
|
||||
const transformers = [
|
||||
new AdjustHeight(),
|
||||
new CalculateCoordinates(),
|
||||
new CalculateStatistics(),
|
||||
new CompactLines(),
|
||||
new SortXWithinLines(),
|
||||
];
|
||||
|
||||
const defaultConfig: Config = {
|
||||
pdfjsParams: {
|
||||
|
@ -3,12 +3,14 @@ import Item from '../Item';
|
||||
import ItemGroup from './ItemGroup';
|
||||
import Page from './Page';
|
||||
|
||||
type KeyExtractor = (item: Item) => any;
|
||||
type PageItemTransformer = (page: number, items: Item[]) => Item[];
|
||||
type LineItemTransformer = (page: number, line: number, items: Item[]) => Item[];
|
||||
|
||||
export function groupByPage(items: Item[]): Item[][] {
|
||||
function groupBy(items: Item[], extractKey: KeyExtractor): Item[][] {
|
||||
return items.reduce((pageItems: Item[][], item: Item) => {
|
||||
const lastPageItems = pageItems[pageItems.length - 1];
|
||||
if (!lastPageItems || item.page > lastPageItems[0]?.page) {
|
||||
if (!lastPageItems || extractKey(item) !== extractKey(lastPageItems[0])) {
|
||||
pageItems.push([item]);
|
||||
} else {
|
||||
lastPageItems.push(item);
|
||||
@ -17,16 +19,12 @@ export function groupByPage(items: Item[]): Item[][] {
|
||||
}, []);
|
||||
}
|
||||
|
||||
/**
 * Groups the given items by their `page` value.
 *
 * Delegates to `groupBy`, using each item's `page` as the grouping key.
 *
 * @param items the items to group
 * @returns one array of items per group, as produced by `groupBy`
 */
export function groupByPage(items: Item[]): Item[][] {
  const pageOf = (item: Item) => item.page;
  return groupBy(items, pageOf);
}
|
||||
|
||||
export function groupByElement(items: Item[], elementName: string): Item[][] {
|
||||
return items.reduce((groupedItems: Item[][], item: Item) => {
|
||||
const lastGroupItems = groupedItems[groupedItems.length - 1];
|
||||
if (!lastGroupItems || item.data[elementName] !== lastGroupItems[0]?.data[elementName]) {
|
||||
groupedItems.push([item]);
|
||||
} else {
|
||||
lastGroupItems.push(item);
|
||||
}
|
||||
return groupedItems;
|
||||
}, []);
|
||||
return groupBy(items, (item) => item.data[elementName]);
|
||||
}
|
||||
|
||||
export function transformGroupedByPage(items: Item[], groupedTransformer: PageItemTransformer): Item[] {
|
||||
@ -35,6 +33,16 @@ export function transformGroupedByPage(items: Item[], groupedTransformer: PageIt
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Applies a transformer to every (page, line) group of the given items and
 * concatenates the results.
 *
 * Items are first grouped with `groupByPage`, then each page group is split
 * with `groupByElement` on the `'line'` data field. The transformer receives
 * the page of the first item of the page group, the `line` value of the first
 * item of the line group, and the items of that line group.
 *
 * @param items the items to transform
 * @param groupedTransformer callback invoked once per line group
 * @returns all items returned by the callback, in invocation order
 */
export function transformGroupedByPageAndLine(items: Item[], groupedTransformer: LineItemTransformer): Item[] {
  const collected: Item[] = [];
  for (const pageItems of groupByPage(items)) {
    for (const lineItems of groupByElement(pageItems, 'line')) {
      const transformedLine = groupedTransformer(pageItems[0].page, lineItems[0].data['line'], lineItems);
      collected.push(...transformedLine);
    }
  }
  return collected;
}
|
||||
|
||||
export function asPages(items: Item[], itemMerger?: ItemMerger): Page[] {
|
||||
return groupByPage(items).map((pageItems: Item[]) => {
|
||||
let itemGroups: ItemGroup[];
|
||||
|
26
core/src/transformer/SortXWithinLines.ts
Normal file
26
core/src/transformer/SortXWithinLines.ts
Normal file
@ -0,0 +1,26 @@
|
||||
import Item from '../Item';
|
||||
import ItemResult from '../ItemResult';
|
||||
import ItemTransformer from './ItemTransformer';
|
||||
import TransformContext from './TransformContext';
|
||||
import { transformGroupedByPageAndLine } from '../support/itemUtils';
|
||||
|
||||
/**
 * Transformer that orders the items of each line by their `x` value.
 *
 * Grouping is delegated to `transformGroupedByPageAndLine`; every group it
 * hands over is sorted ascending on `data['x']`. No messages are emitted.
 */
export default class SortXWithinLines extends ItemTransformer {
  constructor() {
    // Only the required columns are declared; no itemMerger is configured for this transformer.
    super('Sort by X', 'Sorts the items of a line by the x coordinate', {
      requireColumns: ['line', 'x'],
    });
  }

  /**
   * Sorts the items of every line by `x`.
   *
   * @param _ transform context (unused)
   * @param inputItems the items to sort
   * @returns the re-ordered items together with an empty message list
   */
  transform(_: TransformContext, inputItems: Item[]): ItemResult {
    // Ascending numeric comparison on the x value.
    const byAscendingX = (left: Item, right: Item) => left.data['x'] - right.data['x'];

    // Each group array is sorted in place and returned as the group's result.
    const sortedItems = transformGroupedByPageAndLine(inputItems, (_page, _line, lineItems) =>
      lineItems.sort(byAscendingX),
    );

    return {
      items: sortedItems,
      messages: [],
    };
  }
}
|
@ -1,9 +1,16 @@
|
||||
import Item from 'src/Item';
|
||||
import Page from 'src/support/Page';
|
||||
import { groupByPage, groupByElement, transformGroupedByPage, asPages } from 'src/support/itemUtils';
|
||||
import {
|
||||
groupByPage,
|
||||
groupByElement,
|
||||
transformGroupedByPage,
|
||||
transformGroupedByPageAndLine,
|
||||
asPages,
|
||||
} from 'src/support/itemUtils';
|
||||
import ItemGroup from 'src/support/ItemGroup';
|
||||
import ItemMerger from 'src/ItemMerger';
|
||||
import ItemTransformer from 'src/transformer/ItemTransformer';
|
||||
import { items } from 'test/transformer/testItems';
|
||||
|
||||
describe('groupByPage', () => {
|
||||
test('empty', async () => {
|
||||
@ -68,6 +75,30 @@ describe('transformGroupedByPage', () => {
|
||||
});
|
||||
});
|
||||
|
||||
// Tests for transformGroupedByPageAndLine: the transformer callback must be
// invoked once per (page, line) group and its results concatenated in order.
describe('transformGroupedByPageAndLine', () => {
  test('empty', async () => {
    // With no input items there are no groups, so the callback must never run.
    // Throw explicitly instead of calling the `fail` global, which is not
    // provided by every Jest test runner.
    const transformedItems = transformGroupedByPageAndLine([], () => {
      throw new Error("shouldn't be called");
    });
    expect(transformedItems).toEqual([]);
  });

  test('transform', async () => {
    // Fixture built with the `items` test helper — presumably (page, data[]);
    // verify against test/transformer/testItems. Page 1 holds two items on
    // line 1 and one item on line 2.
    const pageItems = [
      items(0, [{ line: 1, id: 1 }]),
      items(1, [
        { line: 1, id: 2 },
        { line: 1, id: 3 },
        { line: 2, id: 4 },
      ]),
      items(2, [{ line: 1, id: 5 }]),
    ];
    const flattenedItems = new Array<Item>().concat(...pageItems);

    // Replace each group by a single marker item encoding "page/line:groupSize".
    const transformedItems = transformGroupedByPageAndLine(flattenedItems, (page, line, items) => {
      return [new Item(0, { group: `${page}/${line}:${items.length}` })];
    });

    expect(transformedItems.map((item) => item.data['group'])).toEqual(['0/1:1', '1/1:2', '1/2:1', '2/1:1']);
  });
});
|
||||
|
||||
describe('asPages', () => {
|
||||
test('empty', async () => {
|
||||
expect(groupByPage([])).toEqual([]);
|
||||
|
Loading…
Reference in New Issue
Block a user