mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-07-13 04:25:11 +02:00
139 lines
5.4 KiB
JavaScript
139 lines
5.4 KiB
JavaScript
import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
|
|
import ParseResult from '../../ParseResult.jsx';
|
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
|
import BlockType from '../../markdown/BlockType.jsx';
|
|
import { headlineByLevel } from '../../markdown/BlockType.jsx';
|
|
import { isListItem } from '../../../stringFunctions.jsx';
|
|
|
|
//Detect headlines based on heights
|
|
export default class DetectHeaders extends ToLineItemTransformation {
|
|
|
|
constructor() {
|
|
super("Detect Headers");
|
|
}
|
|
|
|
transform(parseResult:ParseResult) {
|
|
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, mostUsedDistance, mostUsedFont, maxHeight} = parseResult.globals;
|
|
const hasToc = tocPages.length > 0;
|
|
var detectedHeaders = 0;
|
|
|
|
// Handle title pages
|
|
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
|
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
|
pagesWithMaxHeight.forEach(titlePage => {
|
|
titlePage.items.forEach(item => {
|
|
const height = item.height;
|
|
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
|
if (height == maxHeight) {
|
|
item.type = BlockType.H1;
|
|
} else {
|
|
item.type = BlockType.H2;
|
|
}
|
|
item.annotation = DETECTED_ANNOTATION;
|
|
detectedHeaders++;
|
|
}
|
|
});
|
|
});
|
|
|
|
if (hasToc) { //Use existing headline heights to find additional headlines
|
|
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
|
headlineTypes.forEach(headlineType => {
|
|
var range = headlineTypeToHeightRange[headlineType];
|
|
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
|
parseResult.pages.forEach(page => {
|
|
page.items.forEach(item => {
|
|
if (!item.type && item.height == range.max) {
|
|
item.annotation = DETECTED_ANNOTATION;
|
|
item.type = BlockType.enumValueOf(headlineType);
|
|
detectedHeaders++
|
|
}
|
|
});
|
|
});
|
|
}
|
|
|
|
});
|
|
} else { //Categorize headlines by the text heights
|
|
const heights = [];
|
|
var lastHeight;
|
|
parseResult.pages.forEach(page => {
|
|
page.items.forEach(item => {
|
|
if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) {
|
|
if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) {
|
|
heights.push(item.height);
|
|
}
|
|
}
|
|
});
|
|
});
|
|
heights.sort((a, b) => b - a);
|
|
|
|
heights.forEach((height, i) => {
|
|
const headlineLevel = i + 2;
|
|
if (headlineLevel <= 6) {
|
|
const headlineType = headlineByLevel(2 + i);
|
|
parseResult.pages.forEach(page => {
|
|
page.items.forEach(item => {
|
|
if (!item.type && item.height == height && !isListItem(item.text())) {
|
|
detectedHeaders++;
|
|
item.annotation = DETECTED_ANNOTATION;
|
|
item.type = headlineType;
|
|
}
|
|
});
|
|
});
|
|
}
|
|
});
|
|
}
|
|
|
|
//find headlines which have paragraph height
|
|
var smallesHeadlineLevel = 1;
|
|
parseResult.pages.forEach(page => {
|
|
page.items.forEach(item => {
|
|
if (item.type && item.type.headline) {
|
|
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel);
|
|
}
|
|
});
|
|
});
|
|
if (smallesHeadlineLevel < 6) {
|
|
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
|
|
parseResult.pages.forEach(page => {
|
|
var lastItem;
|
|
page.items.forEach(item => {
|
|
if (!item.type
|
|
&& item.height == mostUsedHeight
|
|
&& item.font !== mostUsedFont
|
|
&& (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2))
|
|
&& item.text() === item.text().toUpperCase()
|
|
) {
|
|
detectedHeaders++;
|
|
item.annotation = DETECTED_ANNOTATION;
|
|
item.type = nextHeadlineType;
|
|
}
|
|
lastItem = item;
|
|
});
|
|
});
|
|
}
|
|
|
|
|
|
return new ParseResult({
|
|
...parseResult,
|
|
messages: [
|
|
'Detected ' + detectedHeaders + ' headlines.',
|
|
]
|
|
});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/**
 * Collects the pages that contain at least one item which has no type yet
 * and whose height equals `maxHeight`.
 *
 * @param pages the pages to scan; each page exposes an `items` array
 * @param maxHeight the height an item must have for its page to be selected
 * @returns a Set of the matching page objects, in the order they appear in `pages`
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    const hasUntypedMaxHeightItem = page =>
        page.items.some(candidate => !candidate.type && candidate.height == maxHeight);
    return new Set(pages.filter(hasUntypedMaxHeightItem));
}
|
|
|