From 77576ebd7e36fb555231e2604cfea23d9027722a Mon Sep 17 00:00:00 2001
From: Johannes Zillmann
Date: Thu, 16 Mar 2017 07:08:46 +0100
Subject: [PATCH] [WIP] Headlines for title pages

---
 .../transformations/DetectHeadlines.jsx       | 11 ++++++
 .../textitem/DetectHeaders.jsx                | 36 +++++++++++++++++--
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/src/javascript/models/transformations/DetectHeadlines.jsx b/src/javascript/models/transformations/DetectHeadlines.jsx
index 8dd6d5e..304ee63 100644
--- a/src/javascript/models/transformations/DetectHeadlines.jsx
+++ b/src/javascript/models/transformations/DetectHeadlines.jsx
@@ -123,6 +123,17 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
 
 }
 
+function findPagesWithMaxHeight(pages, maxHeight) {
+    const maxHeaderPagesSet = new Set();
+    pages.forEach(page => {
+        page.items.forEach(block => {
+            if (!block.type && block.textItems[0].height == maxHeight) {
+                maxHeaderPagesSet.add(page);
+            }
+        });
+    });
+    return maxHeaderPagesSet;
+}
 function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
     // Find pages with max height
     const maxHeaderPagesSet = new Set();
diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
index eb7d51c..0d6be44 100644
--- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx
+++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
@@ -14,17 +14,16 @@ export default class DetectListItems extends ToTextItemTransformation {
     }
 
     transform(parseResult:ParseResult) {
-        const {tocPages, headlineTypeToHeightRange, mostUsedHeight} = parseResult.globals;
+        const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals;
 
         var detectedHeaders = 0;
 
         if (tocPages.length > 0) {
 
-            //apply existing headline heights to find additional headlines
+            //Use existing headline heights to find additional headlines
             const headlineTypes = Object.keys(headlineTypeToHeightRange);
             headlineTypes.forEach(headlineType => {
                 var range = headlineTypeToHeightRange[headlineType];
-                // if (range.min > mostUsedHeight && range.max - range.min <= 1) { //use only very clear headlines
                 if (range.max > mostUsedHeight) { //use only very clear headlines, only use max
                     parseResult.pages.forEach(page => {
                         page.items.forEach(textItem => {
@@ -40,6 +39,24 @@ export default class DetectListItems extends ToTextItemTransformation {
             });
         }
 
+        // Handle title pages
+        const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight);
+        const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
+        pagesWithMaxHeight.forEach(titlePage => {
+            titlePage.items.forEach(textItem => {
+                const height = textItem.height;
+                if (!textItem.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
+                    if (height == maxHeight) {
+                        textItem.type = ElementType.H1;
+                    } else {
+                        textItem.type = ElementType.H2;
+                    }
+                    textItem.annotation = DETECTED_ANNOTATION;
+                    detectedHeaders++;
+                }
+            });
+        });
+
         return new ParseResult({
             ...parseResult,
             messages: [
@@ -51,3 +68,16 @@ export default class DetectListItems extends ToTextItemTransformation {
     }
 
 }
+
+function findPagesWithMaxHeight(pages, maxHeight) {
+    const maxHeaderPagesSet = new Set();
+    pages.forEach(page => {
+        page.items.forEach(textItem => {
+            if (!textItem.type && textItem.height == maxHeight) {
+                maxHeaderPagesSet.add(page);
+            }
+        });
+    });
+    return maxHeaderPagesSet;
+}
+