[WIP] Headlines for title pages

This commit is contained in:
Johannes Zillmann 2017-03-16 07:08:46 +01:00
parent 1eda51c0b4
commit 77576ebd7e
2 changed files with 44 additions and 3 deletions

View File

@ -123,6 +123,17 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
} }
/**
 * Collects every page that contains at least one untyped block whose first
 * text item has the given height.
 *
 * @param {Array} pages - pages to scan; each page exposes an `items` array of blocks
 * @param {number} maxHeight - height the first text item of a block must have
 * @returns {Set} the matching page objects, in the order they were encountered
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    const maxHeaderPagesSet = new Set();
    pages.forEach(page => {
        const hasMaxHeightBlock = page.items.some(block => {
            // Blocks that already carry a type are not candidates.
            if (block.type) {
                return false;
            }
            // A block with no text items has no height to compare; skip it
            // instead of throwing on `textItems[0].height`.
            const firstItem = block.textItems && block.textItems[0];
            if (!firstItem) {
                return false;
            }
            // Loose equality is kept on purpose so the set of matches stays
            // the same as before for every input that did not throw.
            return firstItem.height == maxHeight;
        });
        if (hasMaxHeightBlock) {
            maxHeaderPagesSet.add(page);
        }
    });
    return maxHeaderPagesSet;
}
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) { function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
// Find pages with max height // Find pages with max height
const maxHeaderPagesSet = new Set(); const maxHeaderPagesSet = new Set();

View File

@ -14,17 +14,16 @@ export default class DetectListItems extends ToTextItemTransformation {
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals; const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
var detectedHeaders = 0; var detectedHeaders = 0;
if (tocPages.length > 0) { if (tocPages.length > 0) {
//apply existing headline heights to find additional headlines //Use existing headline heights to find additional headlines
const headlineTypes = Object.keys(headlineTypeToHeightRange); const headlineTypes = Object.keys(headlineTypeToHeightRange);
headlineTypes.forEach(headlineType => { headlineTypes.forEach(headlineType => {
var range = headlineTypeToHeightRange[headlineType]; var range = headlineTypeToHeightRange[headlineType];
// if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
if (range.max > mostUsedHeight) { //use only very clear headlines, only use max if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
parseResult.pages.forEach(page => { parseResult.pages.forEach(page => {
page.items.forEach(textItem => { page.items.forEach(textItem => {
@ -40,6 +39,24 @@ export default class DetectListItems extends ToTextItemTransformation {
}); });
} }
// Handle title pages
const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
pagesWithMaxHeight.forEach(titlePage => {
titlePage.items.forEach(textItem => {
const height = textItem.height;
if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
if (height == maxHeight) {
textItem.type = ElementType.H1;
} else {
textItem.type = ElementType.H2;
}
textItem.annotation = DETECTED_ANNOTATION;
detectedHeaders++;
}
});
});
return new ParseResult({ return new ParseResult({
...parseResult, ...parseResult,
messages: [ messages: [
@ -51,3 +68,16 @@ export default class DetectListItems extends ToTextItemTransformation {
} }
} }
/**
 * Gathers the pages that hold at least one text item which has no type yet
 * and whose height equals the given height.
 *
 * @param {Array} pages - pages to inspect; each page exposes an `items` array of text items
 * @param {number} maxHeight - height a text item must have to qualify its page
 * @returns {Set} the qualifying page objects, in encounter order
 */
function findPagesWithMaxHeight(pages, maxHeight) {
    // True for a text item that is still untyped and has the requested height.
    const isUntypedMaxHeightItem = item => !item.type && item.height == maxHeight;

    const matchingPages = new Set();
    pages.forEach(page => {
        const candidates = page.items.filter(isUntypedMaxHeightItem);
        if (candidates.length > 0) {
            matchingPages.add(page);
        }
    });
    return matchingPages;
}