[WIP] Simplify major headline detections

This commit is contained in:
Johannes Zillmann 2017-03-15 05:27:47 +01:00
parent 5caf8154db
commit 739d20d83b
2 changed files with 28 additions and 215 deletions

View File

@ -1,170 +0,0 @@
import TextItem from './TextItem.jsx';
import { isNumber } from '../functions.jsx'
import { sortByX } from '../textItemFunctions.jsx'
/**
 * Combines text items which are on the same line (similar Y) into a single text item.
 * While doing so, runs of number items are inspected and rewritten as footnote links
 * or footnote markers, depending on their vertical position relative to the line.
 *
 * Planned but not implemented here: whitespace removal and bold/emphasis annotation.
 */
export default class TextItemCombiner {

    /**
     * @param options optional settings object with:
     *   - transformEmphasis: stored on the instance, defaults to true
     *   - mostUsedDistance: line distance used to split items into lines, defaults to 12
     */
    constructor(options) {
        const settings = options || {};
        // `value || true` is always true, so an explicit `false` has to be detected via absence.
        this.transformEmphasis = settings.transformEmphasis == null ? true : settings.transformEmphasis;
        // Falsy fallback is kept on purpose: a distance of 0 would make every item start a new line.
        this.mostUsedDistance = settings.mostUsedDistance || 12;
    }

    /**
     * Merges the items of every detected line into one TextItem.
     *
     * @param textItems the items to combine
     * @returns a CombineResult holding the combined items and the parsed footnote elements
     */
    combine(textItems: TextItem[]) {
        if (textItems.length === 0) {
            // Nothing to combine: hand back an empty, but fully formed, result.
            return new CombineResult({
                textItems: [],
                parsedElements: new ParsedElements({
                    footnoteLinks: [],
                    footnotes: []
                })
            });
        }

        const resultItems = [];
        const [groupedItems, parsedElements] = this.groupByFollowingY(textItems);
        groupedItems.forEach(itemGroup => {
            if (itemGroup.length === 1) {
                resultItems.push(itemGroup[0]);
                return;
            }

            let text = '';
            let maxHeight = 0;
            let widthSum = 0;
            let lastItem;
            itemGroup.forEach(item => {
                // Insert a separating blank only if neither side already provides one
                // and the horizontal gap to the previous item is at least 5 units.
                if (lastItem && !text.endsWith(' ') && !item.text.startsWith(' ')) {
                    const xDistance = item.x - lastItem.x - lastItem.width;
                    if (xDistance >= 5) {
                        text += ' ';
                    }
                }
                text += item.text;
                widthSum += item.width;
                lastItem = item;
                maxHeight = Math.max(maxHeight, item.height);
            });

            // The merged item inherits everything else from the first item of the line.
            resultItems.push(new TextItem({
                ...itemGroup[0],
                text: text,
                height: maxHeight,
                width: widthSum
            }));
        });

        //TODO whitespace removal
        //TODO bold/emphasis

        return new CombineResult({
            textItems: resultItems,
            parsedElements: parsedElements
        });
    }

    /**
     * Groups the items into lines and rewrites runs of number items:
     *   - run positioned above the line's first item (y > first item's y): footnote link `^N`
     *   - run followed by an item with a smaller y: footnote marker `(^N):`
     *   - otherwise the number items are kept untouched
     *
     * @param textItems the items to group
     * @returns a pair of [lines, ParsedElements]; footnoteLinks are collected as numbers,
     *          footnotes as strings
     */
    groupByFollowingY(textItems) {
        const footnoteLinks = [];
        const footnotes = [];

        const lines = this.groupItemsByLine(textItems).map(lineItems => {
            const basicY = lineItems[0].y;
            const newLineItems = [];
            let stashedNumberItems = [];

            // Flushes the currently stashed number items into the new line.
            const commitStashedNumbers = (nextItem) => {
                if (stashedNumberItems.length === 0) {
                    return;
                }
                const firstNumber = stashedNumberItems[0];
                const joinedNumber = stashedNumberItems.map(footnoteLinkItem => footnoteLinkItem.text).join('');
                if (firstNumber.y > basicY) { // footnote link
                    newLineItems.push(new TextItem({
                        ...firstNumber,
                        //TODO make formatting configurable
                        // text: `<sup>[${joinedNumber}](#${joinedNumber})</sup>`
                        text: `^${joinedNumber}`
                    }));
                    footnoteLinks.push(parseInt(joinedNumber, 10));
                } else if (nextItem && nextItem.y < firstNumber.y) { // footnote
                    //TODO womb comp [29] => ydiff == 0
                    newLineItems.push(new TextItem({
                        ...firstNumber,
                        text: `(^${ joinedNumber}):`
                    }));
                    footnotes.push(joinedNumber);
                } else {
                    stashedNumberItems.forEach(number => newLineItems.push(number));
                }
                stashedNumberItems = [];
            };

            lineItems.forEach(item => {
                if (newLineItems.length === 0 && item.text.trim().length === 0) {
                    // skip whitespace on the beginning of a line
                    return;
                }
                if (isNumber(item.text)) {
                    stashedNumberItems.push(item);
                } else {
                    commitStashedNumbers(item);
                    newLineItems.push(item);
                }
            });
            // Numbers at the very end of the line have no following item.
            commitStashedNumbers();
            return newLineItems;
        });

        return [lines, new ParsedElements({
            footnoteLinks: footnoteLinks,
            footnotes: footnotes
        })];
    }

    /**
     * Splits the items into lines. A new line starts as soon as an item's Y differs from
     * the Y of the current line's first item by at least half of `mostUsedDistance`.
     * Every line is then handed to sortByX.
     *
     * @param textItems the items to split
     * @returns an array of lines (arrays of items); empty for empty input
     */
    groupItemsByLine(textItems:TextItem[]) {
        const lines = [];
        let currentLine = [];
        textItems.forEach(item => {
            if (currentLine.length > 0 && Math.abs(currentLine[0].y - item.y) >= this.mostUsedDistance / 2) {
                lines.push(currentLine);
                currentLine = [];
            }
            currentLine.push(item);
        });
        // Do not emit a phantom empty line for empty input.
        if (currentLine.length > 0) {
            lines.push(currentLine);
        }

        lines.forEach(lineItems => {
            // we can't trust order of occurrence, esp. footnoteLinks like to come last
            sortByX(lineItems);
        });
        return lines;
    }

}
/**
 * Value object returned by TextItemCombiner#combine().
 * Carries the `textItems` and `parsedElements` entries of the given options object.
 */
export class CombineResult {

    constructor(options) {
        const { textItems, parsedElements } = options;
        this.textItems = textItems;
        this.parsedElements = parsedElements;
    }

}
/**
 * Holds the footnote links and footnotes collected while combining text items.
 */
export class ParsedElements {

    constructor(options) {
        const { footnoteLinks, footnotes } = options;
        this.footnoteLinks = footnoteLinks;
        this.footnotes = footnotes;
    }

    /**
     * Appends the entries of another ParsedElements to this one.
     * The given instance is left untouched.
     */
    add(parsedElements:ParsedElements) {
        const mergedLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
        const mergedFootnotes = this.footnotes.concat(parsedElements.footnotes);
        this.footnoteLinks = mergedLinks;
        this.footnotes = mergedFootnotes;
    }

}

View File

@ -1,8 +1,7 @@
import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
import ParseResult from '../ParseResult.jsx'; import ParseResult from '../ParseResult.jsx';
import TextItemBlock from '../TextItemBlock.jsx'; import TextItemBlock from '../TextItemBlock.jsx';
import TextItemCombiner from '../TextItemCombiner.jsx'; import { ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../Annotation.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
import ElementType from '../ElementType.jsx'; import ElementType from '../ElementType.jsx';
import { headlineByLevel } from '../ElementType.jsx'; import { headlineByLevel } from '../ElementType.jsx';
@ -17,22 +16,19 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
var foundHeadlines = 0; var foundHeadlines = 0;
const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals; const {mostUsedHeight, mostUsedDistance, maxHeight, tocPages} = parseResult.globals;
const textCombiner = new TextItemCombiner({
mostUsedDistance: mostUsedDistance,
});
//Set max headlines (all headers on the same page are max level 2) //Set max headlines (all headers on the same page are max level 2)
const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight, textCombiner); const maxHeaderPages = convertMaxHeaders(parseResult.pages, maxHeight, mostUsedHeight);
var headlineHeightFlowBeforeToc = []; var headlineHeightFlowBeforeToc = [];
var headlineHeightsOccurenceBeforeToc = {}; var headlineHeightsOccurenceBeforeToc = {};
var firstPageAfterToc = 0; var firstPageAfterToc = 0;
if (tocPages && tocPages.length > 0) { if (tocPages && tocPages.length > 0) {
[headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], textCombiner, mostUsedHeight, maxHeaderPages); [headlineHeightFlowBeforeToc, headlineHeightsOccurenceBeforeToc] = calculateHeadlineHeigthFlow(parseResult.pages, 0, tocPages[0], mostUsedHeight, maxHeaderPages);
firstPageAfterToc = tocPages[tocPages.length - 1] + 1; firstPageAfterToc = tocPages[tocPages.length - 1] + 1;
} }
const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, textCombiner, mostUsedHeight, maxHeaderPages); const [headlineHeightFlowAfterToc, headlineHeightsOccurenceAfterToc] = calculateHeadlineHeigthFlow(parseResult.pages, firstPageAfterToc, parseResult.pages.length, mostUsedHeight, maxHeaderPages);
// TODO ==> do flow analysis (remove out of flow or snap, start with 2nd) // TODO ==> do flow analysis (remove out of flow or snap, start with 2nd)
@ -49,15 +45,15 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
page.items.forEach(block => { page.items.forEach(block => {
newBlocks.push(block); newBlocks.push(block);
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
const combineResult = textCombiner.combine(block.textItems); // const combineResult = textCombiner.combine(block.textItems);
if (combineResult.textItems.length == 1) { // if (combineResult.textItems.length == 1) {
const height = combineResult.textItems[0].height; // const height = combineResult.textItems[0].height;
if (height == maxHeight) { // if (height == maxHeight) {
block.annotation = REMOVED_ANNOTATION; // // block.annotation = REMOVED_ANNOTATION;
currentHeadlineLevel = 1; // currentHeadlineLevel = 1;
headlineSizePerLevel[currentHeadlineLevel] = height // headlineSizePerLevel[currentHeadlineLevel] = height
addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
} // }
// else if (currentHeadlineLevel) { // else if (currentHeadlineLevel) {
// const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel]; // const currentLevelSize = headlineSizePerLevel[currentHeadlineLevel];
// if (height < currentLevelSize) { // if (height < currentLevelSize) {
@ -79,7 +75,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
// addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel)); // addNewBlock(newBlocks, combineResult, headlineByLevel(currentHeadlineLevel));
// } // }
// } // }
} // }
} }
}); });
page.items = newBlocks; page.items = newBlocks;
@ -127,16 +123,7 @@ export default class DetectHeadlines extends ToTextItemBlockTransformation {
} }
function addNewBlock(newBlocks, combineResult, headlineLevel) { function convertMaxHeaders(pages, maxHeight, mostUsedHeight) {
newBlocks.push(new TextItemBlock({
textItems: combineResult.textItems,
type: headlineLevel,
annotation: ADDED_ANNOTATION,
parsedElements: combineResult.parsedElements
}));
}
function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
// Find pages with max height // Find pages with max height
const maxHeaderPagesSet = new Set(); const maxHeaderPagesSet = new Set();
pages.forEach(page => { pages.forEach(page => {
@ -150,27 +137,24 @@ function convertMaxHeaders(pages, maxHeight, mostUsedHeight, textCombiner) {
// Now convert those pages to headlines // Now convert those pages to headlines
const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4); const min2ndLevelHeaderHeigthOnMaxPage = mostUsedHeight + ((maxHeight - mostUsedHeight) / 4);
maxHeaderPagesSet.forEach(pageWithMaxHeader => { maxHeaderPagesSet.forEach(pageWithMaxHeader => {
const newBlocks = [];
pageWithMaxHeader.items.forEach(block => { pageWithMaxHeader.items.forEach(block => {
newBlocks.push(block); if (block.textItems.length == 1) {
const height = block.textItems[0].height; const height = block.textItems[0].height;
if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) { if (!block.type && height > min2ndLevelHeaderHeigthOnMaxPage) {
block.annotation = REMOVED_ANNOTATION; block.annotation = DETECTED_ANNOTATION;
const combineResult = textCombiner.combine(block.textItems); if (height == maxHeight) {
if (height == maxHeight) { block.type = ElementType.H1;
addNewBlock(newBlocks, combineResult, ElementType.H1); } else {
} else if (combineResult.textItems.length == 1) { block.type = ElementType.H2;
addNewBlock(newBlocks, combineResult, ElementType.H2); }
} }
} }
}); });
pageWithMaxHeader.items = newBlocks;
}); });
return Array.from(maxHeaderPagesSet).map(page => page.index + 1); return Array.from(maxHeaderPagesSet).map(page => page.index + 1);
} }
function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeight, maxHeaderPages) { function calculateHeadlineHeigthFlow(pages, from, to, mostUsedHeight, maxHeaderPages) {
const headlineHeightFlow = []; const headlineHeightFlow = [];
const headlineHeightsOccurences = {}; const headlineHeightsOccurences = {};
var lastHeadlineHeight; var lastHeadlineHeight;
@ -179,9 +163,8 @@ function calculateHeadlineHeigthFlow(pages, from, to, textCombiner, mostUsedHeig
if (!maxHeaderPages.includes(page.index + 1)) { if (!maxHeaderPages.includes(page.index + 1)) {
page.items.forEach(block => { page.items.forEach(block => {
if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) { if (!block.type && !block.annotation && block.textItems[0].height > mostUsedHeight) {
const combineResult = textCombiner.combine(block.textItems); if (block.textItems.length == 1) {
if (combineResult.textItems.length == 1) { const height = block.textItems[0].height;
const height = combineResult.textItems[0].height;
headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ; headlineHeightsOccurences[height] = headlineHeightsOccurences[height] ? headlineHeightsOccurences[height] + 1 : 1 ;
if (!lastHeadlineHeight || height != lastHeadlineHeight) { if (!lastHeadlineHeight || height != lastHeadlineHeight) {
headlineHeightFlow.push(height); headlineHeightFlow.push(height);