[WIP] robustify TOC headline finding

This commit is contained in:
Johannes Zillmann 2017-03-16 06:01:07 +01:00
parent dbd9d8bf5f
commit a9b851ceb6

View File

@ -113,21 +113,22 @@ export default class DetectTOC extends ToTextItemTransformation {
}); });
// Add linked headers // Add linked headers
const pageMapping = detectPageMappingNumber(parseResult.pages.filter(page => page.index > lastTocPage.index), tocLinks);
tocLinks.forEach(tocLink => { tocLinks.forEach(tocLink => {
var linkedPage = parseResult.pages[tocLink.pageNumber - 1]; var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
var foundHeadline = false; var foundHealineItems;
if (linkedPage) { if (linkedPage) {
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange); foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
if (!foundHeadline) { // pages are off by 1 ? if (!foundHealineItems) { // pages are off by 1 ?
linkedPage = parseResult.pages[tocLink.pageNumber]; linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
if (linkedPage) { if (linkedPage) {
foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange); foundHealineItems = findHeadlineItems(linkedPage, tocLink.textItem.text);
} }
} }
}
if (foundHealineItems) {
addHeadlineItems(linkedPage, tocLink, foundHealineItems, headlineTypeToHeightRange)
} else { } else {
//TODO sometimes pages are off. We could try the page range from pre to next ...
}
if (!foundHeadline) {
notFoundHeadlines.push(tocLink); notFoundHeadlines.push(tocLink);
} }
}); });
@ -192,8 +193,27 @@ export default class DetectTOC extends ToTextItemTransformation {
} }
function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) { //Find out how the TOC page link actually translates to the page.index
const headline = tocLink.textItem.text; function detectPageMappingNumber(pages, tocLinks) {
// Computes the offset between a TOC link's page number and the index of the page
// on which that link's headline text is found.
// The TOC links are tried in order; the first one whose headline findPageWithHeadline
// locates in `pages` determines the result.
for ( var tocLink of tocLinks ) {
const page = findPageWithHeadline(pages, tocLink.textItem.text);
if (page) {
// Offset to add to a TOC page number to obtain the matching page.index.
return page.index - tocLink.pageNumber;
}
}
// None of the TOC links could be matched to a page.
// NOTE(review): the visible caller adds this result to tocLink.pageNumber, where null
// coerces to 0 - confirm that this is the intended fallback.
return null;
}
/**
 * Looks for the first page on which the given headline can be found.
 *
 * @param pages - iterable of pages, examined in order
 * @param headline - headline text handed to findHeadlineItems for each page
 * @returns the first page for which findHeadlineItems yields a truthy result,
 *          or null when no page qualifies
 */
function findPageWithHeadline(pages, headline) {
    let matchingPage = null;
    for (const candidate of pages) {
        if (!findHeadlineItems(candidate, headline)) {
            continue;
        }
        // Stop at the first hit; later pages are not inspected.
        matchingPage = candidate;
        break;
    }
    return matchingPage;
}
function findHeadlineItems(page, headline) {
const headlineFinder = new HeadlineFinder({ const headlineFinder = new HeadlineFinder({
headline: headline headline: headline
}); });
@ -201,12 +221,23 @@ function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
for ( var line of page.items ) { for ( var line of page.items ) {
const headlineItems = headlineFinder.consume(line); const headlineItems = headlineFinder.consume(line);
if (headlineItems) { if (headlineItems) {
headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION); return {
lineIndex: lineIndex,
headlineItems: headlineItems
};
}
lineIndex++;
}
return null;
}
function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange) {
foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
const headlineType = headlineByLevel(tocLink.level + 2); const headlineType = headlineByLevel(tocLink.level + 2);
const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0); const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
page.items.splice(lineIndex + 1, 0, new TextItem({ page.items.splice(foundItems.lineIndex + 1, 0, new TextItem({
...headlineItems[0], ...foundItems.headlineItems[0],
text: headline, text: tocLink.textItem.text,
height: headlineHeight, height: headlineHeight,
type: headlineType, type: headlineType,
annotation: ADDED_ANNOTATION annotation: ADDED_ANNOTATION
@ -222,11 +253,6 @@ function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
}; };
headlineTypeToHeightRange[headlineType] = range; headlineTypeToHeightRange[headlineType] = range;
} }
return true;
}
lineIndex++;
}
return false;
} }
function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) { function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {