mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-02-23 05:10:47 +01:00
Improve line compaction
This commit is contained in:
parent
0910f7b148
commit
6e5e5c9d53
@ -3,7 +3,7 @@ module.exports = {
|
||||
testEnvironment: 'node',
|
||||
roots: ['./test'],
|
||||
transform: { '\\.ts$': ['ts-jest'] },
|
||||
testRegex: '(/test/.*|(\\.|/)(test|spec))\\.(ts)$',
|
||||
testRegex: '(/test/.*|(\\.|/))(test)\\.(ts)$',
|
||||
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
||||
moduleNameMapper: {
|
||||
'src/(.*)': '<rootDir>/src/$1',
|
||||
|
@ -1,6 +1,4 @@
|
||||
import { assertDefined } from '../assert';
|
||||
import type Item from '../Item';
|
||||
import type ItemMerger from '../ItemMerger';
|
||||
|
||||
export default class ItemGroup {
|
||||
top: Item;
|
||||
|
@ -10,7 +10,7 @@ export default class CompactLines extends ItemTransformer {
|
||||
'Compact Lines',
|
||||
'Combines items on the same y-axis',
|
||||
{
|
||||
requireColumns: ['str', 'y'],
|
||||
requireColumns: ['str', 'y', 'height'],
|
||||
itemMerger: {
|
||||
groupKey: 'line',
|
||||
merge: mergeLineItems,
|
||||
@ -28,20 +28,23 @@ export default class CompactLines extends ItemTransformer {
|
||||
}
|
||||
|
||||
/**
 * Tags every item with the number of the line it belongs to.
 *
 * Items are processed page by page in their given order; the line counter
 * restarts at 0 for each page. An item opens a new line when it is the first
 * item of its page, or when its `y` lies more than its own `height` below the
 * `y` of the previous item. Items that sit at the same `y`, higher, or only
 * slightly lower than their predecessor stay on the current line.
 *
 * @param _ transform context (not used by this transformer)
 * @param inputItems items carrying at least the `y` and `height` data columns
 * @returns the items, each extended with a `line` data entry, plus a summary message
 */
transform(_: TransformContext, inputItems: Item[]): ItemResult {
  // Total number of lines opened across all pages; reported in the message below.
  let lines = 0;
  return {
    items: transformGroupedByPage(inputItems, (_page, items) => {
      let lineNumber = -1;
      let lastY: number | undefined;
      return items.map((item) => {
        const y = item.data['y'];
        const height = item.data['height'];
        // Check for `undefined` explicitly: a truthiness check would treat a
        // previous item at y === 0 as "no previous item" and force a new line.
        if (lastY === undefined || lastY - height > y) {
          lineNumber++;
          lines++;
        }
        lastY = y;
        return item.withDataAddition({ line: lineNumber });
      });
    }),
    messages: [`Formed ${lines} lines out of ${inputItems.length} items`],
  };
}
|
||||
}
|
||||
|
@ -1,41 +1,92 @@
|
||||
import Item from 'src/Item';
|
||||
import CompactLines from 'src/transformer/CompactLines';
|
||||
import { emptyContext } from './testContext';
|
||||
import { items } from './testItems';
|
||||
|
||||
const transformer = new CompactLines();
|
||||
|
||||
// A small raised glyph (third item: higher y, smaller height) must not open a
// new line; only the item that drops well below the others does.
test('Transform - raised characters (example.pdf)', async () => {
  const context = emptyContext();
  const input = items(0, [
    { x: 240, y: 585, str: 'Dies ist eine Test-PDF', height: 11 },
    { x: 352.69, y: 585, str: '.', height: 11 },
    { x: 348, y: 588, str: '1', height: 7.33 },
    { x: 208, y: 572, str: 'Für’s Testen des', height: 11 },
  ]);

  const { items: transformed } = transformer.transform(context, input);
  const lineNumbers = transformed.map((entry) => entry.data['line']);

  expect(lineNumbers).toEqual([0, 0, 0, 1]);
});
|
||||
|
||||
// A small glyph set slightly lower than its neighbours (the '0' item) must stay
// on the surrounding line instead of starting a new one.
test('Transform - lowered charactes (dict.pdf)', async () => {
  const context = emptyContext();
  const input = items(0, [
    { str: 'Let', x: 100.35, y: 625.05, height: 11.96 },
    { str: 'D', x: 122.38, y: 625.05, height: 11.96 },
    { str: '(', x: 100.35, y: 610.61, height: 11.96 },
    { str: 'v', x: 104.9, y: 610.61, height: 11.96 },
    { str: '0', x: 110.57, y: 608.82, height: 7.97 },
    { str: ', a', x: 115.29, y: 610.61, height: 11.96 },
    { str: 'all are different,', x: 100.35, y: 596.16, height: 11.96 },
  ]);

  const { items: transformed } = transformer.transform(context, input);
  const lineNumbers = transformed.map((entry) => entry.data['line']);

  expect(lineNumbers).toEqual([0, 0, 1, 1, 1, 1, 2]);
});
|
||||
|
||||
test('Item Merger', async () => {
|
||||
const itemMerger = new CompactLines().descriptor.itemMerger;
|
||||
const itemMerger = transformer.descriptor.itemMerger;
|
||||
expect(itemMerger?.groupKey).toEqual('line');
|
||||
|
||||
const mergedItem = itemMerger?.merge([
|
||||
new Item(0, {
|
||||
line: 2,
|
||||
x: 240,
|
||||
y: 585,
|
||||
str: 'Dies ist eine Test-PDF',
|
||||
fontName: 'g_d0_f2',
|
||||
dir: 'ltr',
|
||||
width: 108.62,
|
||||
height: 11,
|
||||
}),
|
||||
new Item(0, {
|
||||
line: 2,
|
||||
x: 352.69,
|
||||
y: 585,
|
||||
str: '.',
|
||||
fontName: 'g_d0_f2',
|
||||
dir: 'ltr',
|
||||
width: 3.06,
|
||||
height: 11,
|
||||
}),
|
||||
new Item(0, {
|
||||
line: 2,
|
||||
x: 348,
|
||||
y: 588,
|
||||
str: '1',
|
||||
fontName: 'g_d0_f2',
|
||||
dir: 'ltr',
|
||||
width: 4.08,
|
||||
height: 7.33,
|
||||
}),
|
||||
]);
|
||||
const mergedItem = itemMerger?.merge(
|
||||
items(0, [
|
||||
{
|
||||
line: 2,
|
||||
x: 240,
|
||||
y: 585,
|
||||
str: 'Dies ist eine Test-PDF',
|
||||
fontName: 'g_d0_f2',
|
||||
dir: 'ltr',
|
||||
width: 108.62,
|
||||
height: 11,
|
||||
},
|
||||
{
|
||||
line: 2,
|
||||
x: 352.69,
|
||||
y: 585,
|
||||
str: '.',
|
||||
fontName: 'g_d0_f2',
|
||||
dir: 'ltr',
|
||||
width: 3.06,
|
||||
height: 11,
|
||||
},
|
||||
{
|
||||
line: 2,
|
||||
x: 348,
|
||||
y: 588,
|
||||
str: '1',
|
||||
fontName: 'g_d0_f2',
|
||||
dir: 'ltr',
|
||||
width: 4.08,
|
||||
height: 7.33,
|
||||
},
|
||||
]),
|
||||
);
|
||||
expect(mergedItem?.withoutUuid()).toEqual(
|
||||
new Item(0, {
|
||||
line: 2,
|
||||
|
8
core/test/transformer/testContext.ts
Normal file
8
core/test/transformer/testContext.ts
Normal file
@ -0,0 +1,8 @@
|
||||
import type TransformContext from "src/transformer/TransformContext";
|
||||
|
||||
/**
 * Builds a fresh transform context with an empty font map and no page
 * viewports, for tests that need a context but no content in it.
 */
export function emptyContext(): TransformContext {
  const context: TransformContext = {
    fontMap: new Map(),
    pageViewports: [],
  };
  return context;
}
|
5
core/test/transformer/testItems.ts
Normal file
5
core/test/transformer/testItems.ts
Normal file
@ -0,0 +1,5 @@
|
||||
import Item from 'src/Item';
|
||||
|
||||
/**
 * Wraps each plain data object in an `Item` placed on the given page.
 *
 * @param page page index handed to every created item
 * @param data one data object per item, in output order
 * @returns the created items, in the same order as `data`
 */
export function items(page: number, data: object[]): Item[] {
  const toItem = (entry: object): Item => new Item(page, entry);
  return data.map(toItem);
}
|
@ -5,7 +5,6 @@
|
||||
import { BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
||||
|
||||
import type Debugger from '@core/Debugger';
|
||||
import type Item from '@core/Item';
|
||||
import { asPages } from '../../../core/src/support/itemUtils';
|
||||
|
||||
import Popup from '../components/Popup.svelte';
|
||||
|
Loading…
Reference in New Issue
Block a user