mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-06-25 12:01:45 +02:00
Improve line compaction
This commit is contained in:
parent
0910f7b148
commit
6e5e5c9d53
@ -3,7 +3,7 @@ module.exports = {
|
|||||||
testEnvironment: 'node',
|
testEnvironment: 'node',
|
||||||
roots: ['./test'],
|
roots: ['./test'],
|
||||||
transform: { '\\.ts$': ['ts-jest'] },
|
transform: { '\\.ts$': ['ts-jest'] },
|
||||||
testRegex: '(/test/.*|(\\.|/)(test|spec))\\.(ts)$',
|
testRegex: '(/test/.*|(\\.|/))(test)\\.(ts)$',
|
||||||
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
|
||||||
moduleNameMapper: {
|
moduleNameMapper: {
|
||||||
'src/(.*)': '<rootDir>/src/$1',
|
'src/(.*)': '<rootDir>/src/$1',
|
||||||
|
@ -1,6 +1,4 @@
|
|||||||
import { assertDefined } from '../assert';
|
|
||||||
import type Item from '../Item';
|
import type Item from '../Item';
|
||||||
import type ItemMerger from '../ItemMerger';
|
|
||||||
|
|
||||||
export default class ItemGroup {
|
export default class ItemGroup {
|
||||||
top: Item;
|
top: Item;
|
||||||
|
@ -10,7 +10,7 @@ export default class CompactLines extends ItemTransformer {
|
|||||||
'Compact Lines',
|
'Compact Lines',
|
||||||
'Combines items on the same y-axis',
|
'Combines items on the same y-axis',
|
||||||
{
|
{
|
||||||
requireColumns: ['str', 'y'],
|
requireColumns: ['str', 'y', 'height'],
|
||||||
itemMerger: {
|
itemMerger: {
|
||||||
groupKey: 'line',
|
groupKey: 'line',
|
||||||
merge: mergeLineItems,
|
merge: mergeLineItems,
|
||||||
@ -28,20 +28,23 @@ export default class CompactLines extends ItemTransformer {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(_: TransformContext, inputItems: Item[]): ItemResult {
|
transform(_: TransformContext, inputItems: Item[]): ItemResult {
|
||||||
|
let lines = 0;
|
||||||
return {
|
return {
|
||||||
items: transformGroupedByPage(inputItems, (page, items) => {
|
items: transformGroupedByPage(inputItems, (page, items) => {
|
||||||
let lineNumber = -1;
|
let lineNumber = -1;
|
||||||
let lastY: number | undefined;
|
let lastY: number | undefined;
|
||||||
return items.map((item) => {
|
return items.map((item) => {
|
||||||
const y = item.data['y'];
|
const y = item.data['y'];
|
||||||
if (!lastY || y < lastY) {
|
const height = item.data['height'];
|
||||||
|
if (!lastY || lastY - height > y) {
|
||||||
lineNumber++;
|
lineNumber++;
|
||||||
|
lines++;
|
||||||
}
|
}
|
||||||
lastY = y;
|
lastY = y;
|
||||||
return item.withDataAddition({ line: lineNumber });
|
return item.withDataAddition({ line: lineNumber });
|
||||||
});
|
});
|
||||||
}),
|
}),
|
||||||
messages: [],
|
messages: [`Formed ${lines} lines out of ${inputItems.length} items`],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,41 +1,92 @@
|
|||||||
import Item from 'src/Item';
|
import Item from 'src/Item';
|
||||||
import CompactLines from 'src/transformer/CompactLines';
|
import CompactLines from 'src/transformer/CompactLines';
|
||||||
|
import { emptyContext } from './testContext';
|
||||||
|
import { items } from './testItems';
|
||||||
|
|
||||||
|
const transformer = new CompactLines();
|
||||||
|
|
||||||
|
test('Transform - raised characters (example.pdf)', async () => {
|
||||||
|
const results = transformer.transform(
|
||||||
|
emptyContext(),
|
||||||
|
items(0, [
|
||||||
|
{
|
||||||
|
x: 240,
|
||||||
|
y: 585,
|
||||||
|
str: 'Dies ist eine Test-PDF',
|
||||||
|
height: 11,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
x: 352.69,
|
||||||
|
y: 585,
|
||||||
|
str: '.',
|
||||||
|
height: 11,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
x: 348,
|
||||||
|
y: 588,
|
||||||
|
str: '1',
|
||||||
|
height: 7.33,
|
||||||
|
},
|
||||||
|
{ x: 208, y: 572, str: 'Für’s Testen des', height: 11 },
|
||||||
|
]),
|
||||||
|
);
|
||||||
|
expect(results.items.map((item) => item.data['line'])).toEqual([0, 0, 0, 1]);
|
||||||
|
});
|
||||||
|
|
||||||
|
test('Transform - lowered charactes (dict.pdf)', async () => {
|
||||||
|
const results = transformer.transform(
|
||||||
|
emptyContext(),
|
||||||
|
items(0, [
|
||||||
|
{ str: 'Let', x: 100.35, y: 625.05, height: 11.96 },
|
||||||
|
{ str: 'D', x: 122.38, y: 625.05, height: 11.96 },
|
||||||
|
{ str: '(', x: 100.35, y: 610.61, height: 11.96 },
|
||||||
|
{ str: 'v', x: 104.9, y: 610.61, height: 11.96 },
|
||||||
|
{ str: '0', x: 110.57, y: 608.82, height: 7.97 },
|
||||||
|
{ str: ', a', x: 115.29, y: 610.61, height: 11.96 },
|
||||||
|
{ str: 'all are different,', x: 100.35, y: 596.16, height: 11.96 },
|
||||||
|
]),
|
||||||
|
);
|
||||||
|
expect(results.items.map((item) => item.data['line'])).toEqual([0, 0, 1, 1, 1, 1, 2]);
|
||||||
|
});
|
||||||
|
|
||||||
test('Item Merger', async () => {
|
test('Item Merger', async () => {
|
||||||
const itemMerger = new CompactLines().descriptor.itemMerger;
|
const itemMerger = transformer.descriptor.itemMerger;
|
||||||
expect(itemMerger?.groupKey).toEqual('line');
|
expect(itemMerger?.groupKey).toEqual('line');
|
||||||
|
|
||||||
const mergedItem = itemMerger?.merge([
|
const mergedItem = itemMerger?.merge(
|
||||||
new Item(0, {
|
items(0, [
|
||||||
line: 2,
|
{
|
||||||
x: 240,
|
line: 2,
|
||||||
y: 585,
|
x: 240,
|
||||||
str: 'Dies ist eine Test-PDF',
|
y: 585,
|
||||||
fontName: 'g_d0_f2',
|
str: 'Dies ist eine Test-PDF',
|
||||||
dir: 'ltr',
|
fontName: 'g_d0_f2',
|
||||||
width: 108.62,
|
dir: 'ltr',
|
||||||
height: 11,
|
width: 108.62,
|
||||||
}),
|
height: 11,
|
||||||
new Item(0, {
|
},
|
||||||
line: 2,
|
{
|
||||||
x: 352.69,
|
line: 2,
|
||||||
y: 585,
|
x: 352.69,
|
||||||
str: '.',
|
y: 585,
|
||||||
fontName: 'g_d0_f2',
|
str: '.',
|
||||||
dir: 'ltr',
|
fontName: 'g_d0_f2',
|
||||||
width: 3.06,
|
dir: 'ltr',
|
||||||
height: 11,
|
width: 3.06,
|
||||||
}),
|
height: 11,
|
||||||
new Item(0, {
|
},
|
||||||
line: 2,
|
{
|
||||||
x: 348,
|
line: 2,
|
||||||
y: 588,
|
x: 348,
|
||||||
str: '1',
|
y: 588,
|
||||||
fontName: 'g_d0_f2',
|
str: '1',
|
||||||
dir: 'ltr',
|
fontName: 'g_d0_f2',
|
||||||
width: 4.08,
|
dir: 'ltr',
|
||||||
height: 7.33,
|
width: 4.08,
|
||||||
}),
|
height: 7.33,
|
||||||
]);
|
},
|
||||||
|
]),
|
||||||
|
);
|
||||||
expect(mergedItem?.withoutUuid()).toEqual(
|
expect(mergedItem?.withoutUuid()).toEqual(
|
||||||
new Item(0, {
|
new Item(0, {
|
||||||
line: 2,
|
line: 2,
|
||||||
|
8
core/test/transformer/testContext.ts
Normal file
8
core/test/transformer/testContext.ts
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
import type TransformContext from "src/transformer/TransformContext";
|
||||||
|
|
||||||
|
export function emptyContext():TransformContext{
|
||||||
|
return {
|
||||||
|
fontMap:new Map(),
|
||||||
|
pageViewports:[]
|
||||||
|
};
|
||||||
|
}
|
5
core/test/transformer/testItems.ts
Normal file
5
core/test/transformer/testItems.ts
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
import Item from 'src/Item';
|
||||||
|
|
||||||
|
export function items(page: number, data: object[]): Item[] {
|
||||||
|
return data.map((data) => new Item(page, data));
|
||||||
|
}
|
@ -5,7 +5,6 @@
|
|||||||
import { BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
import { BookOpen, ArrowLeft, ArrowRight } from 'svelte-hero-icons';
|
||||||
|
|
||||||
import type Debugger from '@core/Debugger';
|
import type Debugger from '@core/Debugger';
|
||||||
import type Item from '@core/Item';
|
|
||||||
import { asPages } from '../../../core/src/support/itemUtils';
|
import { asPages } from '../../../core/src/support/itemUtils';
|
||||||
|
|
||||||
import Popup from '../components/Popup.svelte';
|
import Popup from '../components/Popup.svelte';
|
||||||
|
Loading…
x
Reference in New Issue
Block a user