mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2025-01-03 20:28:54 +01:00
[WIP] headline detection for non TOC pdfs
This commit is contained in:
parent
77576ebd7e
commit
4600dc6ee7
@ -123,17 +123,6 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
||||
|
||||
}
|
||||
|
||||
/**
 * Collects the pages that contain at least one untyped block whose first
 * text item has a height equal to `maxHeight`.
 *
 * @param pages - pages to scan; each page is read through `page.items`, and each
 *                item through `block.type` and `block.textItems`
 * @param maxHeight - height compared against the first text item of every block
 * @returns a Set holding the matching page objects (each page at most once)
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        // A single matching block is enough to qualify the whole page.
        const hasMaxHeightBlock = page.items.some(block => {
            if (block.type) {
                return false; // already classified blocks are ignored
            }
            // Guard against blocks without any text items instead of throwing.
            const firstItem = block.textItems && block.textItems[0];
            return Boolean(firstItem) && firstItem.height == maxHeight;
        });
        if (hasMaxHeightBlock) {
            maxHeaderPagesSet.add(page);
        }
    });
    return maxHeaderPagesSet;
}
|
||||
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
||||
// Find pages with max height
|
||||
const maxHeaderPagesSet = new Set();
|
@ -1,13 +1,11 @@
|
||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||
import ParseResult from '../../ParseResult.jsx';
|
||||
import TextItem from '../../TextItem.jsx';
|
||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||
import ElementType from '../../ElementType.jsx';
|
||||
import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
|
||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
||||
import { headlineByLevel } from '../../ElementType.jsx';
|
||||
|
||||
//Detect items starting with -, •, etc...
|
||||
export default class DetectListItems extends ToTextItemTransformation {
|
||||
export default class DetectHeaders extends ToTextItemTransformation {
|
||||
|
||||
constructor() {
|
||||
super("Detect Headers");
|
||||
@ -15,30 +13,9 @@ export default class DetectListItems extends ToTextItemTransformation {
|
||||
|
||||
transform(parseResult:ParseResult) {
|
||||
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
|
||||
|
||||
const hasToc = tocPages.length > 0;
|
||||
var detectedHeaders = 0;
|
||||
|
||||
if (tocPages.length > 0) {
|
||||
|
||||
//Use existing headline heights to find additional headlines
|
||||
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||
headlineTypes.forEach(headlineType => {
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == range.max) {
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.enumValueOf(headlineType);
|
||||
detectedHeaders++
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
// Handle title pages
|
||||
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
||||
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||
@ -57,11 +34,58 @@ export default class DetectListItems extends ToTextItemTransformation {
|
||||
});
|
||||
});
|
||||
|
||||
if (hasToc) {
|
||||
//Use existing headline heights to find additional headlines
|
||||
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||
headlineTypes.forEach(headlineType => {
|
||||
var range = headlineTypeToHeightRange[headlineType];
|
||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == range.max) {
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = ElementType.enumValueOf(headlineType);
|
||||
detectedHeaders++
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
});
|
||||
} else {
|
||||
const heights = [];
|
||||
var lastHeight;
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height > mostUsedHeight) {
|
||||
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
|
||||
heights.push(textItem.height);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
heights.sort((a, b) => b - a);
|
||||
|
||||
heights.forEach((height, i) => {
|
||||
const headlineType = headlineByLevel(2 + i);
|
||||
parseResult.pages.forEach(page => {
|
||||
page.items.forEach(textItem => {
|
||||
if (!textItem.type && textItem.height == height) {
|
||||
detectedHeaders++;
|
||||
textItem.annotation = DETECTED_ANNOTATION;
|
||||
textItem.type = headlineType;
|
||||
}
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
|
||||
return new ParseResult({
|
||||
...parseResult,
|
||||
messages: [
|
||||
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
||||
'Detected ' + detectedHeaders + ' headlines.'
|
||||
'Detected ' + detectedHeaders + ' headlines.',
|
||||
]
|
||||
});
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user