mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 15:23:26 +01:00
[WIP] headline detection for non TOC pdfs
This commit is contained in:
parent
77576ebd7e
commit
4600dc6ee7
@ -123,17 +123,6 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Collects every page that contains at least one untyped block whose first
 * text item has the given height.
 *
 * @param pages     pages to scan; each page exposes an `items` array of blocks
 * @param maxHeight height compared against the first text item of each block
 * @returns a Set holding each matching page object once
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        const hasMaxHeightBlock = page.items.some(block => {
            // Only blocks without a type are considered.
            if (block.type) {
                return false;
            }
            // A block without text items has no height to compare;
            // skip it instead of throwing on `textItems[0].height`.
            const firstItem = block.textItems && block.textItems[0];
            if (!firstItem) {
                return false;
            }
            // Loose equality is kept on purpose so matching behaves as before.
            return firstItem.height == maxHeight;
        });
        if (hasMaxHeightBlock) {
            maxHeaderPagesSet.add(page);
        }
    });
    return maxHeaderPagesSet;
}
|
|
||||||
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
||||||
// Find pages with max height
|
// Find pages with max height
|
||||||
const maxHeaderPagesSet = new Set();
|
const maxHeaderPagesSet = new Set();
|
@ -1,13 +1,11 @@
|
|||||||
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
|
||||||
import ParseResult from '../../ParseResult.jsx';
|
import ParseResult from '../../ParseResult.jsx';
|
||||||
import TextItem from '../../TextItem.jsx';
|
import { DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
||||||
import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
|
|
||||||
import ElementType from '../../ElementType.jsx';
|
import ElementType from '../../ElementType.jsx';
|
||||||
import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
|
import { headlineByLevel } from '../../ElementType.jsx';
|
||||||
import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
|
|
||||||
|
|
||||||
//Detect headlines based on text item heights
|
//Detect headlines based on text item heights
|
||||||
export default class DetectListItems extends ToTextItemTransformation {
|
export default class DetectHeaders extends ToTextItemTransformation {
|
||||||
|
|
||||||
constructor() {
|
constructor() {
|
||||||
super("Detect Headers");
|
super("Detect Headers");
|
||||||
@ -15,30 +13,9 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
|
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
|
||||||
|
const hasToc = tocPages.length > 0;
|
||||||
var detectedHeaders = 0;
|
var detectedHeaders = 0;
|
||||||
|
|
||||||
if (tocPages.length > 0) {
|
|
||||||
|
|
||||||
//Use existing headline heights to find additional headlines
|
|
||||||
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
|
||||||
headlineTypes.forEach(headlineType => {
|
|
||||||
var range = headlineTypeToHeightRange[headlineType];
|
|
||||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
|
||||||
parseResult.pages.forEach(page => {
|
|
||||||
page.items.forEach(textItem => {
|
|
||||||
if (!textItem.type && textItem.height == range.max) {
|
|
||||||
textItem.annotation = DETECTED_ANNOTATION;
|
|
||||||
textItem.type = ElementType.enumValueOf(headlineType);
|
|
||||||
detectedHeaders++
|
|
||||||
}
|
|
||||||
});
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Handle title pages
|
// Handle title pages
|
||||||
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
||||||
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||||
@ -57,11 +34,58 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (hasToc) {
|
||||||
|
//Use existing headline heights to find additional headlines
|
||||||
|
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||||
|
headlineTypes.forEach(headlineType => {
|
||||||
|
var range = headlineTypeToHeightRange[headlineType];
|
||||||
|
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
page.items.forEach(textItem => {
|
||||||
|
if (!textItem.type && textItem.height == range.max) {
|
||||||
|
textItem.annotation = DETECTED_ANNOTATION;
|
||||||
|
textItem.type = ElementType.enumValueOf(headlineType);
|
||||||
|
detectedHeaders++
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
});
|
||||||
|
} else {
|
||||||
|
const heights = [];
|
||||||
|
var lastHeight;
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
page.items.forEach(textItem => {
|
||||||
|
if (!textItem.type && textItem.height > mostUsedHeight) {
|
||||||
|
if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) {
|
||||||
|
heights.push(textItem.height);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
heights.sort((a, b) => b - a);
|
||||||
|
|
||||||
|
heights.forEach((height, i) => {
|
||||||
|
const headlineType = headlineByLevel(2 + i);
|
||||||
|
parseResult.pages.forEach(page => {
|
||||||
|
page.items.forEach(textItem => {
|
||||||
|
if (!textItem.type && textItem.height == height) {
|
||||||
|
detectedHeaders++;
|
||||||
|
textItem.annotation = DETECTED_ANNOTATION;
|
||||||
|
textItem.type = headlineType;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
messages: [
|
messages: [
|
||||||
'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
|
'Detected ' + detectedHeaders + ' headlines.',
|
||||||
'Detected ' + detectedHeaders + ' headlines.'
|
|
||||||
]
|
]
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user