Files
2024-03-26 10:52:54 -06:00

139 lines
5.4 KiB
JavaScript

import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
import BlockType from '../../markdown/BlockType.jsx';
import { headlineByLevel } from '../../markdown/BlockType.jsx';
import { isListItem } from '../../../stringFunctions.jsx';
//Detect headlines based on heights
export default class DetectHeaders extends ToLineItemTransformation {
constructor() {
super("Detect Headers");
}
transform(parseResult:ParseResult) {
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, mostUsedDistance, mostUsedFont, maxHeight} = parseResult.globals;
const hasToc = tocPages.length > 0;
var detectedHeaders = 0;
// Handle title pages
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
pagesWithMaxHeight.forEach(titlePage => {
titlePage.items.forEach(item => {
const height = item.height;
if (!item.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
if (height == maxHeight) {
item.type = BlockType.H1;
} else {
item.type = BlockType.H2;
}
item.annotation = DETECTED_ANNOTATION;
detectedHeaders++;
}
});
});
if (hasToc) { //Use existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType];
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (!item.type && item.height == range.max) {
item.annotation = DETECTED_ANNOTATION;
item.type = BlockType.enumValueOf(headlineType);
detectedHeaders++
}
});
});
}
});
} else { //Categorize headlines by the text heights
const heights = [];
var lastHeight;
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (!item.type && item.height > mostUsedHeight && !isListItem(item.text())) {
if (!heights.includes(item.height) && (!lastHeight || lastHeight > item.height)) {
heights.push(item.height);
}
}
});
});
heights.sort((a, b) => b - a);
heights.forEach((height, i) => {
const headlineLevel = i + 2;
if (headlineLevel <= 6) {
const headlineType = headlineByLevel(2 + i);
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (!item.type && item.height == height && !isListItem(item.text())) {
detectedHeaders++;
item.annotation = DETECTED_ANNOTATION;
item.type = headlineType;
}
});
});
}
});
}
//find headlines which have paragraph height
var smallesHeadlineLevel = 1;
parseResult.pages.forEach(page => {
page.items.forEach(item => {
if (item.type && item.type.headline) {
smallesHeadlineLevel = Math.max(smallesHeadlineLevel, item.type.headlineLevel);
}
});
});
if (smallesHeadlineLevel < 6) {
const nextHeadlineType = headlineByLevel(smallesHeadlineLevel + 1);
parseResult.pages.forEach(page => {
var lastItem;
page.items.forEach(item => {
if (!item.type
&& item.height == mostUsedHeight
&& item.font !== mostUsedFont
&& (!lastItem || lastItem.y < item.y || (lastItem.type && lastItem.type.headline) || (lastItem.y - item.y > mostUsedDistance * 2))
&& item.text() === item.text().toUpperCase()
) {
detectedHeaders++;
item.annotation = DETECTED_ANNOTATION;
item.type = nextHeadlineType;
}
lastItem = item;
});
});
}
return new ParseResult({
...parseResult,
messages: [
'Detected ' + detectedHeaders + ' headlines.',
]
});
}
}
function findPagesWithMaxHeight(pages, maxHeight) {
const maxHeaderPagesSet = new Set();
pages.forEach(page => {
page.items.forEach(item => {
if (!item.type && item.height == maxHeight) {
maxHeaderPagesSet.add(page);
}
});
});
return maxHeaderPagesSet;
}