[WIP] Simplify major headline detections

This commit is contained in:
Johannes Zillmann 2017-03-15 05:27:47 +01:00
parent 5caf8154db
commit 739d20d83b
2 changed files with 28 additions and 215 deletions

View File

@ -1,170 +0,0 @@
import TextItem from './TextItem.jsx';
import { isNumber } from '../functions.jsx'
import { sortByX } from '../textItemFunctions.jsx'
//Combines text items which are on the same Y at the same time doing inline transformations like
//'whitespace removal', bold/emphasis annotation, link-detection, etc..
export default class TextItemCombiner {
constructor(options) {
this.transformEmphasis = options.transformEmphasis || true;
this.mostUsedDistance = options.mostUsedDistance || 12;
}
// returns a CombineResult
combine(textItems: TextItem[]) {
if (textItems.length == 0) {
return new CombineResult({
textItems: resultItems,
parsedElements: {}
});
}
const resultItems = [];
const [groupedItems, parsedElements] = this.groupByFollowingY(textItems);
groupedItems.forEach(itemGroup => {
if (itemGroup.length == 1) {
resultItems.push(itemGroup[0]);
} else {
var text = '';
var maxHeight = 0;
var widthSum = 0;
var lastItem;
itemGroup.forEach(item => {
if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
const xDistance = item.x - lastItem.x - lastItem.width;
if (xDistance >= 5) {
text += ' ';
}
}
text += item.text;
widthSum += item.width;
lastItem = item;
maxHeight = Math.max(maxHeight, item.height);
});
resultItems.push(new TextItem({
...itemGroup[0],
text: text,
height: maxHeight,
width: widthSum
}));
}
});
//TODO whitespace removal
//TODO bold/emphasis
return new CombineResult({
textItems: resultItems,
parsedElements: parsedElements
});
}
groupByFollowingY(textItems) {
const footnoteLinks = [];
const footnotes = [];
var lines = this.groupItemsByLine(textItems);
lines = lines.map(lineItems => {
const basicY = lineItems[0].y;
const newLineItems = [];
var stashedNumberItems = [];
const commitStashedNumbers = (nextItem) => {
if (stashedNumberItems.length > 0) {
const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
if (stashedNumberItems[0].y > basicY) { // footnote link
newLineItems.push(new TextItem({
...stashedNumberItems[0],
//TODO make fomatting configurable
// text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
text: `^${joinedNumber}`
}));
footnoteLinks.push(parseInt(joinedNumber));
} else if (nextItem && nextItem.y < stashedNumberItems[0].y) { // footnote
//TODO womb comp [29] => ydiff == 0
newLineItems.push(new TextItem({
...stashedNumberItems[0],
text: `(^${ joinedNumber}):`
}));
footnotes.push(joinedNumber);
} else {
stashedNumberItems.forEach(number => newLineItems.push(number));
}
stashedNumberItems = [];
}
};
lineItems.forEach(item => {
if (newLineItems.length == 0 && item.text.trim().length == 0) {
// skip whitespace on the beginning of a line
} else {
const isANumber = isNumber(item.text);
if (isANumber) {
stashedNumberItems.push(item);
} else {
if (stashedNumberItems.length > 0) {
commitStashedNumbers(item);
}
newLineItems.push(item);
}
}
});
commitStashedNumbers();
return newLineItems;
});
return [lines, new ParsedElements({
footnoteLinks: footnoteLinks,
footnotes: footnotes
})];
}
groupItemsByLine(textItems:TextItem[]) {
const lines = [];
var currentLine = [];
textItems.forEach(item => {
if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
lines.push(currentLine);
currentLine = [];
}
currentLine.push(item);
});
lines.push(currentLine);
lines.forEach(lineItems => {
// we can't trust order of occurence, esp. footnoteLinks like to come last
sortByX(lineItems);
});
return lines;
}
}
//Result of the TextItemCombiner#combine()
export class CombineResult {
constructor(options) {
this.textItems = options.textItems;
this.parsedElements = options.parsedElements;
}
}
export class ParsedElements {
constructor(options) {
this.footnoteLinks = options.footnoteLinks;
this.footnotes = options.footnotes;
}
add(parsedElements:ParsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
}
}

View File

@ -1,8 +1,7 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx';
import { headlineByLevel } from '../ElementType.jsx';
@ -17,22 +16,19 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
var foundHeadlines = 0;
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance,
});
//Set max headlines (all headers on the same page are max level 2)
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner);
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight);
var headlineHeightFlowBeforeToc = [];
var headlineHeightsOccurenceBeforeToc = {};
var firstPageAfterToc = 0;
if (tocPages && tocPages.length > 0) {
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages);
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages);
firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
}
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages);
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages);
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
@ -49,15 +45,15 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
page.items.forEach(block => {
newBlocks.push(block);
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
const combineResult = textCombiner.combine(block.textItems);
if (combineResult.textItems.length == 1) {
const height = combineResult.textItems[0].height;
if (height == maxHeight) {
block.annotation = REMOVED_ANNOTATION;
currentHeadlineLevel = 1;
headlineSizePerLevel[currentHeadlineLevel] = height
addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
}
// const combineResult = textCombiner.combine(block.textItems);
// if (combineResult.textItems.length == 1) {
// const height = combineResult.textItems[0].height;
// if (height == maxHeight) {
// // block.annotation = REMOVED_ANNOTATION;
// currentHeadlineLevel = 1;
// headlineSizePerLevel[currentHeadlineLevel] = height
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
// }
// else if (currentHeadlineLevel) {
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
// if (height < currentLevelSize) {
@ -79,7 +75,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
// }
// }
}
// }
}
});
page.items = newBlocks;
@ -127,16 +123,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
}
// Appends a new TextItemBlock to newBlocks, built from the combined text items and
// parsed elements of combineResult, typed as headlineLevel and annotated as added.
function addNewBlock(newBlocks, combineResult, headlineLevel) {
    const { textItems, parsedElements } = combineResult;
    const headlineBlock = new TextItemBlock({
        textItems,
        type: headlineLevel,
        annotation: ADDED_ANNOTATION,
        parsedElements
    });
    newBlocks.push(headlineBlock);
}
function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
// Find pages with max height
const maxHeaderPagesSet = new Set();
pages.forEach(page => {
@ -150,27 +137,24 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
// Now convert those pages to headlines
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
maxHeaderPagesSet.forEach(pageWithMaxHeader => {
const newBlocks = [];
pageWithMaxHeader.items.forEach(block => {
newBlocks.push(block);
if (block.textItems.length == 1) {
const height = block.textItems[0].height;
if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
block.annotation = REMOVED_ANNOTATION;
const combineResult = textCombiner.combine(block.textItems);
block.annotation = DETECTED_ANNOTATION;
if (height == maxHeight) {
addNewBlock(newBlocks, combineResult, ElementType.H1);
} else if (combineResult.textItems.length == 1) {
addNewBlock(newBlocks, combineResult, ElementType.H2);
block.type = ElementType.H1;
} else {
block.type = ElementType.H2;
}
}
}
});
pageWithMaxHeader.items = newBlocks;
});
return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
}
function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) {
function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) {
const headlineHeightFlow = [];
const headlineHeightsOccurences = {};
var lastHeadlineHeight;
@ -179,9 +163,8 @@ function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeig
if (!maxHeaderPages.includes(page.index + 1)) {
page.items.forEach(block => {
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
const combineResult = textCombiner.combine(block.textItems);
if (combineResult.textItems.length == 1) {
const height = combineResult.textItems[0].height;
if (block.textItems.length == 1) {
const height = block.textItems[0].height;
headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
if (!lastHeadlineHeight || height != lastHeadlineHeight) {
headlineHeightFlow.push(height);