From 739d20d83b1c68f902c2952931385bec0b35f36a Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Wed, 15 Mar 2017 05:27:47 +0100 Subject: [PATCH] [WIP] Simplify major headline detections --- src/javascript/models/TextItemCombiner.jsx | 170 ------------------ .../transformations/DetectHeadlines.jsx | 73 +++----- 2 files changed, 28 insertions(+), 215 deletions(-) delete mode 100644 src/javascript/models/TextItemCombiner.jsx diff --git a/src/javascript/models/TextItemCombiner.jsx b/src/javascript/models/TextItemCombiner.jsx deleted file mode 100644 index 4333fbc..0000000 --- a/src/javascript/models/TextItemCombiner.jsx +++ /dev/null @@ -1,170 +0,0 @@ -import TextItem from './TextItem.jsx'; -import { isNumber } from '../functions.jsx' -import { sortByX } from '../textItemFunctions.jsx' - -//Combines text items which are on the same Y at the same time doing inline transformations like -//'whitespace removal', bold/emphasis annotation, link-detection, etc.. -export default class TextItemCombiner { - - constructor(options) { - this.transformEmphasis = options.transformEmphasis || true; - this.mostUsedDistance = options.mostUsedDistance || 12; - } - - // returns a CombineResult - combine(textItems: TextItem[]) { - if (textItems.length == 0) { - return new CombineResult({ - textItems: resultItems, - parsedElements: {} - }); - } - const resultItems = []; - const [groupedItems, parsedElements] = this.groupByFollowingY(textItems); - groupedItems.forEach(itemGroup => { - if (itemGroup.length == 1) { - resultItems.push(itemGroup[0]); - } else { - var text = ''; - var maxHeight = 0; - var widthSum = 0; - var lastItem; - itemGroup.forEach(item => { - if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) { - const xDistance = item.x - lastItem.x - lastItem.width; - if (xDistance >= 5) { - text += ' '; - } - } - text += item.text; - widthSum += item.width; - lastItem = item; - maxHeight = Math.max(maxHeight, item.height); - }); - resultItems.push(new TextItem({ - ...itemGroup[0], - text: text, - height: maxHeight, - width: widthSum - })); - } - }); - - //TODO whitespace removal - //TODO bold/emphasis - - return new CombineResult({ - textItems: resultItems, - parsedElements: parsedElements - }); - } - - groupByFollowingY(textItems) { - const footnoteLinks = []; - const footnotes = []; - - - var lines = this.groupItemsByLine(textItems); - lines = lines.map(lineItems => { - const basicY = lineItems[0].y; - const newLineItems = []; - var stashedNumberItems = []; - - - const commitStashedNumbers = (nextItem) => { - if (stashedNumberItems.length > 0) { - const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join(''); - if (stashedNumberItems[0].y > basicY) { // footnote link - newLineItems.push(new TextItem({ - ...stashedNumberItems[0], - //TODO make fomatting configurable - // text: `[${joinedNumber}](#${joinedNumber})` - text: `^${joinedNumber}` - })); - footnoteLinks.push(parseInt(joinedNumber)); - } else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote - //TODO womb comp [29] => ydiff == 0 - newLineItems.push(new TextItem({ - ...stashedNumberItems[0], - text: `(^${ joinedNumber}):` - })); - footnotes.push(joinedNumber); - } else { - stashedNumberItems.forEach(number => newLineItems.push(number)); - } - - stashedNumberItems = []; - } - }; - - lineItems.forEach(item => { - if (newLineItems.length == 0 && item.text.trim().length == 0) { - // skip whitespace on the beginning of a line - } else { - const isANumber = isNumber(item.text); - if (isANumber) { - stashedNumberItems.push(item); - } else { - if (stashedNumberItems.length > 0) { - commitStashedNumbers(item); - } - newLineItems.push(item); - } - } - }); - commitStashedNumbers(); - return newLineItems; - }); - - - return [lines, new ParsedElements({ - footnoteLinks: footnoteLinks, - footnotes: footnotes - })]; - } - - groupItemsByLine(textItems:TextItem[]) { - const lines = []; - var currentLine = []; - textItems.forEach(item => { - if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) { - lines.push(currentLine); - currentLine = []; - } - currentLine.push(item); - }); - lines.push(currentLine); - - lines.forEach(lineItems => { - // we can't trust order of occurence, esp. footnoteLinks like to come last - sortByX(lineItems); - }); - return lines; - } - -} - -//Result of the TextItemCombiner#combine() -export class CombineResult { - - constructor(options) { - this.textItems = options.textItems; - this.parsedElements = options.parsedElements; - } - -} - -export class ParsedElements { - - constructor(options) { - this.footnoteLinks = options.footnoteLinks; - this.footnotes = options.footnotes; - } - - add(parsedElements:ParsedElements) { - this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); - this.footnotes = this.footnotes.concat(parsedElements.footnotes); - } - -} - diff --git a/src/javascript/models/transformations/DetectHeadlines.jsx b/src/javascript/models/transformations/DetectHeadlines.jsx index 1b82a57..8dd6d5e 100644 --- a/src/javascript/models/transformations/DetectHeadlines.jsx +++ b/src/javascript/models/transformations/DetectHeadlines.jsx @@ -1,8 +1,7 @@ import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import TextItemBlock from '../TextItemBlock.jsx'; -import TextItemCombiner from '../TextItemCombiner.jsx'; -import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; +import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx'; import ElementType from '../ElementType.jsx'; import { headlineByLevel } from '../ElementType.jsx'; @@ -17,22 +16,19 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation { var foundHeadlines = 0; const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals; - const textCombiner = new TextItemCombiner({ - mostUsedDistance: mostUsedDistance, - }); - //Set max headlines (all headers on the same page are max level 2) - const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner); + const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight); + var headlineHeightFlowBeforeToc = []; var headlineHeightsOccurenceBeforeToc = {}; var firstPageAfterToc = 0; if (tocPages && tocPages.length > 0) { - [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages); + [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages); firstPageAfterToc = tocPages[tocPages.length - 1] + 1; } - const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages); + const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages); // TODO ==> do flow analysis (remove out of flow or snap, start with 2nd) @@ -49,15 +45,15 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation { page.items.forEach(block => { newBlocks.push(block); if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { - const combineResult = textCombiner.combine(block.textItems); - if (combineResult.textItems.length == 1) { - const height = combineResult.textItems[0].height; - if (height == maxHeight) { - block.annotation = REMOVED_ANNOTATION; - currentHeadlineLevel = 1; - headlineSizePerLevel[currentHeadlineLevel] = height - addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); - } + // const combineResult = textCombiner.combine(block.textItems); + // if (combineResult.textItems.length == 1) { + // const height = combineResult.textItems[0].height; + // if (height == maxHeight) { + // // block.annotation = REMOVED_ANNOTATION; + // currentHeadlineLevel = 1; + // headlineSizePerLevel[currentHeadlineLevel] = height + // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); + // } // else if (currentHeadlineLevel) { // const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel]; // if (height < currentLevelSize) { @@ -79,7 +75,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation { // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); // } // } - } + // } } }); page.items = newBlocks; @@ -127,16 +123,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation { } -function addNewBlock(newBlocks, combineResult, headlineLevel) { - newBlocks.push(new TextItemBlock({ - textItems: combineResult.textItems, - type: headlineLevel, - annotation: ADDED_ANNOTATION, - parsedElements: combineResult.parsedElements - })); -} - -function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) { +function convertMaxHeaders(pages, maxHeight, mostUsedHeight) { // Find pages with max height const maxHeaderPagesSet = new Set(); pages.forEach(page => { @@ -150,27 +137,24 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) { // Now convert those pages to headlines const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); maxHeaderPagesSet.forEach(pageWithMaxHeader => { - const newBlocks = []; pageWithMaxHeader.items.forEach(block => { - newBlocks.push(block); - const height = block.textItems[0].height; - if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) { - block.annotation = REMOVED_ANNOTATION; - const combineResult = textCombiner.combine(block.textItems); - if (height == maxHeight) { - addNewBlock(newBlocks, combineResult, ElementType.H1); - } else if (combineResult.textItems.length == 1) { - addNewBlock(newBlocks, combineResult, ElementType.H2); + if (block.textItems.length == 1) { + const height = block.textItems[0].height; + if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) { + block.annotation = DETECTED_ANNOTATION; + if (height == maxHeight) { + block.type = ElementType.H1; + } else { + block.type = ElementType.H2; + } } } }); - pageWithMaxHeader.items = newBlocks; }); - return Array.from(maxHeaderPagesSet).map(page => page.index + 1); } -function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) { +function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) { const headlineHeightFlow = []; const headlineHeightsOccurences = {}; var lastHeadlineHeight; @@ -179,9 +163,8 @@ function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeig if (!maxHeaderPages.includes(page.index + 1)) { page.items.forEach(block => { if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { - const combineResult = textCombiner.combine(block.textItems); - if (combineResult.textItems.length == 1) { - const height = combineResult.textItems[0].height; + if (block.textItems.length == 1) { + const height = block.textItems[0].height; headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ; if (!lastHeadlineHeight || height != lastHeadlineHeight) { headlineHeightFlow.push(height);