mirror of
https://github.com/jzillmann/pdf-to-markdown.git
synced 2024-11-21 23:33:31 +01:00
[WIP] Headlines for title pages
This commit is contained in:
parent
1eda51c0b4
commit
77576ebd7e
@ -123,6 +123,17 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collects every page that contains at least one untyped block whose first
 * text item has exactly the given height.
 *
 * @param {Array<Object>} pages - pages to scan; each page is expected to expose an `items` array of blocks.
 * @param {number} maxHeight - the height to look for (compared with strict equality).
 * @returns {Set<Object>} the matching page objects, in the order they were first encountered.
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        // `some` stops at the first matching block; one hit is enough to select the page.
        const hasMaxHeightBlock = page.items.some(block => {
            if (block.type) {
                return false; // blocks that already carry a type are not candidates
            }
            // A block without text items has no height to compare; skip it instead of throwing.
            const firstItem = block.textItems && block.textItems[0];
            return Boolean(firstItem) && firstItem.height === maxHeight;
        });
        if (hasMaxHeightBlock) {
            maxHeaderPagesSet.add(page);
        }
    });
    return maxHeaderPagesSet;
}
|
||||||
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
|
||||||
// Find pages with max height
|
// Find pages with max height
|
||||||
const maxHeaderPagesSet = new Set();
|
const maxHeaderPagesSet = new Set();
|
||||||
|
@ -14,17 +14,16 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
transform(parseResult:ParseResult) {
|
transform(parseResult:ParseResult) {
|
||||||
const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals;
|
const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
|
||||||
|
|
||||||
var detectedHeaders = 0;
|
var detectedHeaders = 0;
|
||||||
|
|
||||||
if (tocPages.length > 0) {
|
if (tocPages.length > 0) {
|
||||||
|
|
||||||
//apply existing headline heights to find additional headlines
|
//Use existing headline heights to find additional headlines
|
||||||
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
const headlineTypes = Object.keys(headlineTypeToHeightRange);
|
||||||
headlineTypes.forEach(headlineType => {
|
headlineTypes.forEach(headlineType => {
|
||||||
var range = headlineTypeToHeightRange[headlineType];
|
var range = headlineTypeToHeightRange[headlineType];
|
||||||
// if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
|
|
||||||
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
|
||||||
parseResult.pages.forEach(page => {
|
parseResult.pages.forEach(page => {
|
||||||
page.items.forEach(textItem => {
|
page.items.forEach(textItem => {
|
||||||
@ -40,6 +39,24 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Handle title pages
|
||||||
|
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
|
||||||
|
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
|
||||||
|
pagesWithMaxHeight.forEach(titlePage => {
|
||||||
|
titlePage.items.forEach(textItem => {
|
||||||
|
const height = textItem.height;
|
||||||
|
if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
|
||||||
|
if (height == maxHeight) {
|
||||||
|
textItem.type = ElementType.H1;
|
||||||
|
} else {
|
||||||
|
textItem.type = ElementType.H2;
|
||||||
|
}
|
||||||
|
textItem.annotation = DETECTED_ANNOTATION;
|
||||||
|
detectedHeaders++;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
return new ParseResult({
|
return new ParseResult({
|
||||||
...parseResult,
|
...parseResult,
|
||||||
messages: [
|
messages: [
|
||||||
@ -51,3 +68,16 @@ export default class DetectListItems extends ToTextItemTransformation {
|
|||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Collects every page that contains at least one untyped text item whose
 * height is exactly the given height.
 *
 * @param {Array<Object>} pages - pages to scan; each page is expected to expose an `items` array of text items.
 * @param {number} maxHeight - the height to look for (compared with strict equality).
 * @returns {Set<Object>} the matching page objects, in the order they were first encountered.
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    // Items that already carry a type are not candidates.
    const isUntypedMaxHeightItem = textItem => !textItem.type && textItem.height === maxHeight;

    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        // `some` stops at the first match; one hit is enough to select the page.
        if (page.items.some(isUntypedMaxHeightItem)) {
            maxHeaderPagesSet.add(page);
        }
    });
    return maxHeaderPagesSet;
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user