From 4600dc6ee7b845c58c2144301f1ed2bc00706565 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Thu, 16 Mar 2017 07:38:36 +0100 Subject: [PATCH] [WIP] headline detection for non TOC pdfs --- .../{ => old}/DetectHeadlines.jsx | 11 --- .../textitem/DetectHeaders.jsx | 82 ++++++++++++------- 2 files changed, 53 insertions(+), 40 deletions(-) rename src/javascript/models/transformations/{ => old}/DetectHeadlines.jsx (96%) diff --git a/src/javascript/models/transformations/DetectHeadlines.jsx b/src/javascript/models/transformations/old/DetectHeadlines.jsx similarity index 96% rename from src/javascript/models/transformations/DetectHeadlines.jsx rename to src/javascript/models/transformations/old/DetectHeadlines.jsx index 304ee63..8dd6d5e 100644 --- a/src/javascript/models/transformations/DetectHeadlines.jsx +++ b/src/javascript/models/transformations/old/DetectHeadlines.jsx @@ -123,17 +123,6 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation { } -function findPagesWithMaxHeight(pages, maxHeight) { - const maxHeaderPagesSet = new Set(); - pages.forEach(page => { - page.items.forEach(block => { - if (!block.type && block.textItems[0].height == maxHeight) { - maxHeaderPagesSet.add(page); - } - }); - }); - return maxHeaderPagesSet; -} function convertMaxHeaders(pages, maxHeight, mostUsedHeight) { // Find pages with max height const maxHeaderPagesSet = new Set(); diff --git a/src/javascript/models/transformations/textitem/DetectHeaders.jsx b/src/javascript/models/transformations/textitem/DetectHeaders.jsx index 0d6be44..6c226e3 100644 --- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx +++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx @@ -1,13 +1,11 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; import ParseResult from '../../ParseResult.jsx'; -import TextItem from '../../TextItem.jsx'; -import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx'; +import { DETECTED_ANNOTATION } from '../../Annotation.jsx'; import ElementType from '../../ElementType.jsx'; -import { isHeadline, headlineByLevel } from '../../ElementType.jsx'; -import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx'; +import { headlineByLevel } from '../../ElementType.jsx'; //Detect items starting with -, •, etc... -export default class DetectListItems extends ToTextItemTransformation { +export default class DetectHeaders extends ToTextItemTransformation { constructor() { super("Detect Headers"); @@ -15,30 +13,9 @@ export default class DetectListItems extends ToTextItemTransformation { transform(parseResult:ParseResult) { const {tocPages, headlineTypeToHeightRange, mostUsedHeight, maxHeight} = parseResult.globals; - + const hasToc = tocPages.length > 0; var detectedHeaders = 0; - if (tocPages.length > 0) { - - //Use existing headline heights to find additional headlines - const headlineTypes = Object.keys(headlineTypeToHeightRange); - headlineTypes.forEach(headlineType => { - var range = headlineTypeToHeightRange[headlineType]; - if (range.max > mostUsedHeight) { //use only very clear headlines, only use max - parseResult.pages.forEach(page => { - page.items.forEach(textItem => { - if (!textItem.type && textItem.height == range.max) { - textItem.annotation = DETECTED_ANNOTATION; - textItem.type = ElementType.enumValueOf(headlineType); - detectedHeaders++ - } - }); - }); - } - - }); - } - // Handle title pages const pagesWithMaxHeight = findPagesWithMaxHeight(parseResult.pages, maxHeight); const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); @@ -57,11 +34,58 @@ export default class DetectListItems extends ToTextItemTransformation { }); }); + if (hasToc) { + //Use existing headline heights to find additional headlines + const headlineTypes = Object.keys(headlineTypeToHeightRange); + headlineTypes.forEach(headlineType => { + var range = headlineTypeToHeightRange[headlineType]; + if (range.max > mostUsedHeight) { //use only very clear headlines, only use max + parseResult.pages.forEach(page => { + page.items.forEach(textItem => { + if (!textItem.type && textItem.height == range.max) { + textItem.annotation = DETECTED_ANNOTATION; + textItem.type = ElementType.enumValueOf(headlineType); + detectedHeaders++ + } + }); + }); + } + + }); + } else { + const heights = []; + var lastHeight; + parseResult.pages.forEach(page => { + page.items.forEach(textItem => { + if (!textItem.type && textItem.height > mostUsedHeight) { + if (!heights.includes(textItem.height) && (!lastHeight || lastHeight > textItem.height)) { + heights.push(textItem.height); + } + } + }); + }); + heights.sort((a, b) => b - a); + + heights.forEach((height, i) => { + const headlineType = headlineByLevel(2 + i); + parseResult.pages.forEach(page => { + page.items.forEach(textItem => { + if (!textItem.type && textItem.height == height) { + detectedHeaders++; + textItem.annotation = DETECTED_ANNOTATION; + textItem.type = headlineType; + } + }); + }); + }); + } + + + return new ParseResult({ ...parseResult, messages: [ - 'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange), - 'Detected ' + detectedHeaders + ' headlines.' + 'Detected ' + detectedHeaders + ' headlines.', ] });