import ToLineItemTransformation from '../ToLineItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import LineItem from '../../LineItem.jsx';
import Word from '../../Word.jsx';
import HeadlineFinder from '../../HeadlineFinder.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
import BlockType from '../../markdown/BlockType.jsx';
import { headlineByLevel } from '../../markdown/BlockType.jsx';
import { isDigit, isNumber, wordMatch, hasOnly } from '../../../stringFunctions.jsx';

// Detect table of contents pages plus linked headlines
export default class DetectTOC extends ToLineItemTransformation {

    constructor() {
        super("Detect TOC");
    }

    // Scans the first pages (at most 20) for table-of-contents pages, i.e. pages where
    // more than 75% of the lines end with a number. For every TOC page found:
    //  - the original lines are annotated as removed and TOC items are appended to the last TOC page,
    //  - the headline each TOC entry points to is looked up (by text, then by line height)
    //    and a typed headline item is inserted after it.
    // Returns a new ParseResult carrying `tocPages` and `headlineTypeToHeightRange` in its globals
    // plus summary messages. Page and line items of the given parseResult are modified in place.
    transform(parseResult:ParseResult) {
        const tocPages = [];
        const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
        const linkLeveler = new LinkLeveler();
        var tocLinks = [];
        var lastTocPage;
        var headlineItem; // first line without a trailing number; kept across consecutive TOC pages

        parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
            var lineItemsWithDigits = 0;
            const unknownLines = new Set();
            const pageTocLinks = [];
            var lastWordsWithoutNumber;
            var lastLine;

            // find lines ending with a number per page
            page.items.forEach(line => {
                // presumably drops the dot leaders between title and page number — see hasOnly()
                var words = line.words.filter(word => !hasOnly(word.string, '.'));
                const digits = [];

                // trailing words that are numbers on their own
                while (words.length > 0 && isNumber(words[words.length - 1].string)) {
                    const numberWord = words.pop();
                    digits.unshift(numberWord.string);
                }

                // otherwise: digits glued to the end of the last word
                if (digits.length == 0 && words.length > 0) {
                    const lastWord = words[words.length - 1];
                    var text = lastWord.string;
                    while (text.length > 0 && isDigit(text.charCodeAt(text.length - 1))) {
                        digits.unshift(text.charAt(text.length - 1));
                        text = text.substring(0, text.length - 1);
                    }
                    if (digits.length > 0) {
                        // Replace the word in our filtered copy instead of editing it in place:
                        // the original word belongs to the page's line, which must stay intact
                        // when the page turns out not to be a TOC page.
                        // NOTE(review): assumes Word's constructor accepts its own fields as options
                        // (the way LineItem is copied below) — confirm.
                        words[words.length - 1] = new Word({
                            ...lastWord,
                            string: text
                        });
                    }
                }

                if (digits.length > 0) {
                    if (lastWordsWithoutNumber) { // 2-line item ?
                        // the pending words come from the preceding line, so they go first
                        words = lastWordsWithoutNumber.concat(words);
                        lastWordsWithoutNumber = null;
                    }
                    pageTocLinks.push(new TocLink({
                        pageNumber: parseInt(digits.join(''), 10),
                        lineItem: new LineItem({
                            ...line,
                            words: words
                        })
                    }));
                    lineItemsWithDigits++;
                } else if (!headlineItem) {
                    headlineItem = line;
                } else {
                    if (lastWordsWithoutNumber) {
                        // two number-less lines in a row: the earlier one is not part of a TOC entry
                        unknownLines.add(lastLine);
                    }
                    lastWordsWithoutNumber = words;
                    lastLine = line;
                }
            });

            // page has been processed
            if (lineItemsWithDigits * 100 / page.items.length > 75) {
                tocPages.push(page.index + 1);
                lastTocPage = page;
                linkLeveler.levelPageItems(pageTocLinks);
                tocLinks.push(...pageTocLinks);

                const newBlocks = [];
                page.items.forEach((line) => {
                    if (!unknownLines.has(line)) {
                        line.annotation = REMOVED_ANNOTATION;
                    }
                    newBlocks.push(line);
                    if (line === headlineItem) {
                        newBlocks.push(new LineItem({
                            ...line,
                            type: BlockType.H2,
                            annotation: ADDED_ANNOTATION
                        }));
                    }
                });
                page.items = newBlocks;
            } else {
                headlineItem = null;
            }
        });

        // all pages have been processed
        const totalHeadlines = tocLinks.length;
        const notFoundHeadlines = [];
        const foundBySize = [];
        const headlineTypeToHeightRange = {}; //H1={min:23, max:25}

        if (tocPages.length > 0) {
            // Add TOC items
            tocLinks.forEach(tocLink => {
                lastTocPage.items.push(new LineItem({
                    words: [new Word({
                        string: ' '.repeat(tocLink.level * 3) + '-'
                    })].concat(tocLink.lineItem.words),
                    type: BlockType.TOC,
                    annotation: ADDED_ANNOTATION
                }));
            });

            // Add linked headers
            const pagesAfterToc = parseResult.pages.filter(page => page.index > lastTocPage.index);
            const pageMapping = detectPageMappingNumber(pagesAfterToc, tocLinks);
            tocLinks.forEach(tocLink => {
                var linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping];
                var foundHeadlineItems;
                if (linkedPage) {
                    foundHeadlineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
                    if (!foundHeadlineItems) { // pages are off by 1 ?
                        linkedPage = parseResult.pages[tocLink.pageNumber + pageMapping + 1];
                        if (linkedPage) {
                            foundHeadlineItems = findHeadlineItems(linkedPage, tocLink.lineItem.text());
                        }
                    }
                }
                if (foundHeadlineItems) {
                    addHeadlineItems(linkedPage, tocLink, foundHeadlineItems, headlineTypeToHeightRange);
                } else {
                    notFoundHeadlines.push(tocLink);
                }
            });

            // Try to find linked headers by height
            var fromPage = lastTocPage.index + 2;
            var lastNotFound = [];
            // Searches the pages between the last found headline and currentPageNumber for each
            // pending (not yet found) TOC link, using the height range learned for its headline type.
            const rollupLastNotFound = (currentPageNumber) => {
                lastNotFound.forEach(notFoundTocLink => {
                    const headlineType = headlineByLevel(notFoundTocLink.level + 2);
                    const heightRange = headlineTypeToHeightRange[headlineType.name];
                    if (!heightRange) {
                        return; // no headline of this type was found by text, so nothing to compare against
                    }
                    const [pageIndex, lineIndex] = findPageAndLineFromHeadline(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
                    if (lineIndex > -1) {
                        const page = parseResult.pages[pageIndex];
                        page.items[lineIndex].annotation = REMOVED_ANNOTATION;
                        page.items.splice(lineIndex + 1, 0, new LineItem({
                            ...notFoundTocLink.lineItem,
                            type: headlineType,
                            annotation: ADDED_ANNOTATION,
                        }));
                        foundBySize.push(notFoundTocLink);
                    }
                });
                lastNotFound = [];
            };

            if (notFoundHeadlines.length > 0) {
                tocLinks.forEach(tocLink => {
                    if (notFoundHeadlines.includes(tocLink)) {
                        lastNotFound.push(tocLink);
                    } else {
                        rollupLastNotFound(tocLink.pageNumber);
                        fromPage = tocLink.pageNumber;
                    }
                });
                if (lastNotFound.length > 0) {
                    rollupLastNotFound(parseResult.pages.length);
                }
            }
        }

        const messages = [];
        messages.push('Detected ' + tocPages.length + ' table of content pages');
        if (tocPages.length > 0) {
            messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
            messages.push('Found TOC headlines: ' + (totalHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + totalHeadlines);
        }
        if (notFoundHeadlines.length > 0) {
            messages.push('Found TOC headlines (by size): ' + foundBySize.map(tocLink => tocLink.lineItem.text()));
            messages.push('Missing TOC headlines: ' + notFoundHeadlines.filter(fTocLink => !foundBySize.includes(fTocLink)).map(tocLink => tocLink.lineItem.text() + '=>' + tocLink.pageNumber));
        }

        return new ParseResult({
            ...parseResult,
            globals: {
                ...parseResult.globals,
                tocPages: tocPages,
                headlineTypeToHeightRange: headlineTypeToHeightRange
            },
            messages: messages
        });
    }

}

// Find out how the TOC page link actually translates to the page.index.
// Returns the offset (page.index - tocLink.pageNumber) of the first TOC link whose headline
// is found on one of the given pages, or 0 when none is found.
function detectPageMappingNumber(pages, tocLinks) {
    for ( var tocLink of tocLinks ) {
        const page = findPageWithHeadline(pages, tocLink.lineItem.text());
        if (page) {
            return page.index - tocLink.pageNumber;
        }
    }
    // No headline located: apply no offset. (Previously null, which callers added to a number, i.e. 0.)
    return 0;
}

// Returns the first of the given pages that contains the headline, or null.
function findPageWithHeadline(pages, headline) {
    for ( var page of pages ) {
        if (findHeadlineItems(page, headline)) {
            return page;
        }
    }
    return null;
}

// Feeds the page's lines, in order, to a HeadlineFinder for the given headline.
// Returns { lineIndex, headlineItems } for the line at which the finder reports a result, or null.
function findHeadlineItems(page, headline) {
    const headlineFinder = new HeadlineFinder({
        headline: headline
    });
    var lineIndex = 0;
    for ( var line of page.items ) {
        const headlineItems = headlineFinder.consume(line);
        if (headlineItems) {
            return {
                lineIndex: lineIndex,
                headlineItems: headlineItems
            };
        }
        lineIndex++;
    }
    return null;
}

// Marks the found headline lines as removed, inserts a typed headline item (carrying the TOC
// entry's words) right after them, and widens the recorded height range of that headline type.
function addHeadlineItems(page, tocLink, foundItems, headlineTypeToHeightRange) {
    foundItems.headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
    const headlineType = headlineByLevel(tocLink.level + 2);
    const headlineHeight = foundItems.headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
    page.items.splice(foundItems.lineIndex + 1, 0, new LineItem({
        ...foundItems.headlineItems[0],
        words: tocLink.lineItem.words,
        height: headlineHeight,
        type: headlineType,
        annotation: ADDED_ANNOTATION
    }));

    const range = headlineTypeToHeightRange[headlineType.name];
    if (range) {
        range.min = Math.min(range.min, headlineHeight);
        range.max = Math.max(range.max, headlineHeight);
    } else {
        headlineTypeToHeightRange[headlineType.name] = {
            min: headlineHeight,
            max: headlineHeight
        };
    }
}

// Looks on pages fromPage..toPage (1-based, inclusive) for an untyped, unannotated line whose
// height lies within heightRange and whose text matches the TOC link text with a wordMatch
// score of at least 0.5. Returns [pageIndex, lineIndex] (0-based) or [-1, -1].
function findPageAndLineFromHeadline(pages, tocLink, heightRange, fromPage, toPage) {
    const linkText = tocLink.lineItem.text().toUpperCase();
    // The bounds come from page numbers printed in the TOC, which may be 0 or lie beyond the
    // end of the document; restrict the scan to pages that actually exist.
    const firstPage = Math.max(1, fromPage);
    const lastPage = Math.min(pages.length, toPage);
    for (var i = firstPage; i <= lastPage; i++) {
        const page = pages[i - 1];
        const lineIndex = page.items.findIndex(line => {
            if (!line.type && !line.annotation && line.height >= heightRange.min && line.height <= heightRange.max) {
                const match = wordMatch(linkText, line.text());
                return match >= 0.5;
            }
            return false;
        });
        if (lineIndex > -1) return [i - 1, lineIndex];
    }
    return [-1, -1];
}

// Assigns a nesting level to TOC links. The strategy is chosen once, from the first batch of
// links it sees: by x position if there is more than one, else by font if there is more than
// one, else every link gets level 0.
class LinkLeveler {
    constructor() {
        this.levelByMethod = null;
        this.uniqueFonts = [];
    }

    // Levels the links of one TOC page, picking the leveling strategy on the first call.
    levelPageItems(tocLinks:TocLink[]) {
        if (!this.levelByMethod) {
            const uniqueX = this.calculateUniqueX(tocLinks);
            if (uniqueX.length > 1) {
                this.levelByMethod = this.levelByXDiff;
            } else {
                const uniqueFonts = this.calculateUniqueFonts(tocLinks);
                if (uniqueFonts.length > 1) {
                    this.uniqueFonts = uniqueFonts;
                    this.levelByMethod = this.levelByFont;
                } else {
                    this.levelByMethod = this.levelToZero;
                }
            }
        }
        this.levelByMethod(tocLinks);
    }

    // Level = rank of the link's x position among the ascending unique x positions of this batch.
    levelByXDiff(tocLinks) {
        const uniqueX = this.calculateUniqueX(tocLinks);
        tocLinks.forEach(link => {
            link.level = uniqueX.indexOf(link.lineItem.x);
        });
    }

    // Level = position of the link's font in the list of known fonts (order of first appearance).
    levelByFont(tocLinks) {
        tocLinks.forEach(link => {
            var level = this.uniqueFonts.indexOf(link.lineItem.font);
            if (level < 0) {
                // Font not seen when the strategy was chosen (e.g. first used on a later TOC page).
                // Register it as the next level: a level of -1 would make the indentation
                // ' '.repeat(level * 3) throw a RangeError.
                this.uniqueFonts.push(link.lineItem.font);
                level = this.uniqueFonts.length - 1;
            }
            link.level = level;
        });
    }

    levelToZero(tocLinks) {
        tocLinks.forEach(link => {
            link.level = 0;
        });
    }

    // Distinct x positions of the links, sorted ascending.
    calculateUniqueX(tocLinks) {
        const uniqueX = [...new Set(tocLinks.map(link => link.lineItem.x))];
        uniqueX.sort((a, b) => a - b);
        return uniqueX;
    }

    // Distinct fonts of the links, in order of first appearance.
    calculateUniqueFonts(tocLinks) {
        return [...new Set(tocLinks.map(link => link.lineItem.font))];
    }

}

// One TOC entry: the line item holding its text, the page number it points to and its nesting level.
class TocLink {
    constructor(options) {
        this.lineItem = options.lineItem;
        this.pageNumber = options.pageNumber;
        this.level = 0;
    }
}