diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx index 12d7b79..f8ed8f3 100644 --- a/src/javascript/components/debug/TextItemTable.jsx +++ b/src/javascript/components/debug/TextItemTable.jsx @@ -49,6 +49,9 @@ export default class TextItemTable extends React.Component {
{ textItem.annotation ? textItem.annotation.category : '' }
+
+ { textItem.type ? textItem.type : '' } +
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' } diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index 36c227a..3f91d19 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -1,13 +1,13 @@ import { Enum } from 'enumify'; import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx'; +import CompactLines from './transformations/CompactLines.jsx'; import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx' import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx'; -import CompactLines from './transformations/CompactLines.jsx'; +import DetectTOC from './transformations/DetectTOC.jsx' import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx' import DetectFootnotes from './transformations/DetectFootnotes.jsx' -import DetectTOC from './transformations/DetectTOC.jsx' import DetectLists from './transformations/DetectLists.jsx' import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx' import DetectHeadlines from './transformations/DetectHeadlines.jsx' @@ -34,10 +34,10 @@ export default class AppState { new CompactLines(), new RemoveRepetitiveElements(), new VerticalToHorizontal(), + new DetectTOC(), new DetectPdfBlocks(), new DetectFootnotes(), - new DetectTOC(), new DetectLists(), new DetectCodeBlocks(), new DetectHeadlines(), diff --git a/src/javascript/models/transformations/DetectTOC.jsx b/src/javascript/models/transformations/DetectTOC.jsx index b39f17a..edfe42f 100644 --- a/src/javascript/models/transformations/DetectTOC.jsx +++ b/src/javascript/models/transformations/DetectTOC.jsx @@ -1,106 +1,97 @@ -import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx'; +import ToTextItemTransformation from './ToTextItemTransformation.jsx'; import ParseResult from '../ParseResult.jsx'; import TextItem from '../TextItem.jsx'; -import TextItemBlock from '../TextItemBlock.jsx'; -import TextItemCombiner from '../TextItemCombiner.jsx'; import HeadlineFinder from '../HeadlineFinder.jsx'; import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx'; import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx'; import { isDigit } from '../../functions.jsx' //Detect table of contents pages -export default class DetectTOC extends ToTextItemBlockTransformation { +export default class DetectTOC extends ToTextItemTransformation { constructor() { super("Detect TOC"); } transform(parseResult:ParseResult) { - const {mostUsedDistance} = parseResult.globals; const tocPages = []; const maxPagesToEvaluate = Math.min(20, parseResult.pages.length); - const textCombiner = new TextItemCombiner({ - mostUsedDistance: mostUsedDistance - }); const linkLeveler = new LinkLeveler(); var tocLinks = []; var lastTocPage; + var headlineItem; parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => { - var linesCount = 0; - var linesWithDigitsCount = 0; - var lineItemsWithDigits = []; - const unknownBlocks = new Set(); - var headlineBlock; + const lineItemsWithDigits = []; + const unknownLines = new Set(); const pageTocLinks = []; - page.items.forEach(block => { - var blockHasLinesWithDigits = false; - const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; - var lastLineTextWithoutNumber; - itemsGroupedByY.forEach(lineItem => { - linesCount++ - var lineText = lineItem.text.replace(/\./g, '').trim(); - var endsWithDigit = false; - var digits = []; - while (isDigit(lineText.charCodeAt(lineText.length - 1))) { - digits.unshift(lineText.charAt(lineText.length - 1)); - lineText = lineText.substring(0, lineText.length - 1); - endsWithDigit = true; + var lastLineTextWithoutNumber; + var lastLine; + page.items.forEach(line => { + var lineText = line.text.replace(/\./g, '').trim(); + var endsWithDigit = false; + var digits = []; + while (isDigit(lineText.charCodeAt(lineText.length - 1))) { + digits.unshift(lineText.charAt(lineText.length - 1)); + lineText = lineText.substring(0, lineText.length - 1); + endsWithDigit = true; + } + lineText = lineText.trim(); + if (endsWithDigit) { + endsWithDigit = true; + if (lastLineTextWithoutNumber) { // 2-line item ? + lineText = lastLineTextWithoutNumber + ' ' + lineText; + lastLineTextWithoutNumber = null; } - lineText = lineText.trim(); - if (endsWithDigit) { - if (lastLineTextWithoutNumber) { // 2-line item ? - lineText = lastLineTextWithoutNumber + ' ' + lineText; - lastLineTextWithoutNumber = null; - } - linesWithDigitsCount++; - blockHasLinesWithDigits = true; - pageTocLinks.push(new TocLink({ - pageNumber: parseInt(digits.join('')), - textItem: new TextItem({ - ...lineItem, - text: lineText - }) - })); - lineItemsWithDigits.push(new TextItem({ - ...lineItem, + pageTocLinks.push(new TocLink({ + pageNumber: parseInt(digits.join('')), + textItem: new TextItem({ + ...line, text: lineText - })); + }) + })); + lineItemsWithDigits.push(new TextItem({ + ...line, + text: lineText + })); + lastLineTextWithoutNumber = null; + } else { + if (!headlineItem) { + headlineItem = line; } else { + if (lastLineTextWithoutNumber) { + unknownLines.add(lastLine); + } lastLineTextWithoutNumber = lineText; - } - }); - if (!blockHasLinesWithDigits) { - if (!headlineBlock) { - headlineBlock = block; - } else { - unknownBlocks.add(block); + lastLine = line; } } }); // page has been processed - if (linesWithDigitsCount * 100 / linesCount > 75) { + if (lineItemsWithDigits.length * 100 / page.items.length > 75) { tocPages.push(page.index + 1); lastTocPage = page; linkLeveler.levelPageItems(pageTocLinks); tocLinks = tocLinks.concat(pageTocLinks); const newBlocks = []; - page.items.forEach((block) => { - if (!unknownBlocks.has(block)) { - block.annotation = REMOVED_ANNOTATION; + page.items.forEach((line) => { + if (!unknownLines.has(line)) { + line.annotation = REMOVED_ANNOTATION; } - newBlocks.push(block); - if (block === headlineBlock) { - newBlocks.push(new TextItemBlock({ - textItems: textCombiner.combine(block.textItems).textItems, + newBlocks.push(line); + if (line === headlineItem) { + newBlocks.push(new TextItem({ + ...line, type: HEADLINE2, annotation: ADDED_ANNOTATION })); } }); page.items = newBlocks; + } else { + headlineItem = null; } }); @@ -112,11 +103,11 @@ export default class DetectTOC extends ToTextItemBlockTransformation { var linkedPage = parseResult.pages[tocLink.pageNumber - 1]; var foundHeadline = false; if (linkedPage) { - foundHeadline = findHeadline(linkedPage, tocLink, textCombiner); + foundHeadline = findHeadline(linkedPage, tocLink); if (!foundHeadline) { // pages are off by 1 ? linkedPage = parseResult.pages[tocLink.pageNumber]; if (linkedPage) { - foundHeadline = findHeadline(linkedPage, tocLink, textCombiner); + foundHeadline = findHeadline(linkedPage, tocLink); } } } else { @@ -126,14 +117,13 @@ export default class DetectTOC extends ToTextItemBlockTransformation { notFoundHeadlines.push(tocLink); } }); - lastTocPage.items.push(new TextItemBlock({ - textItems: tocLinks.map(tocLink => { - tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text; - return tocLink.textItem - }), - type: TOC_BLOCK, - annotation: ADDED_ANNOTATION - })); + tocLinks.forEach(tocLink => { + lastTocPage.items.push(new TextItem({ + text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text, + type: TOC_BLOCK, + annotation: ADDED_ANNOTATION + })); + }); } const messages = []; @@ -157,37 +147,25 @@ export default class DetectTOC extends ToTextItemBlockTransformation { } -function findHeadline(page, tocLink, textCombiner) { +function findHeadline(page, tocLink) { const headline = tocLink.textItem.text; const headlineFinder = new HeadlineFinder({ headline: headline }); - var blockIndex = 0; - var lastBlock; - for ( var block of page.items ) { - const itemsGroupedByY = textCombiner.combine(block.textItems).textItems; - for ( var item of itemsGroupedByY ) { - const headlineItems = headlineFinder.consume(item); - if (headlineItems) { - const usedItems = headlineFinder.stackedTextItems; - block.annotation = REMOVED_ANNOTATION; - if (usedItems.length > itemsGroupedByY.length) { - // 2 line headline - lastBlock.annotation = REMOVED_ANNOTATION; - } - page.items.splice(blockIndex + 1, 0, new TextItemBlock({ - textItems: [new TextItem({ - ...usedItems[0], - text: headline - })], - type: headlineByLevel(tocLink.level + 2), - annotation: ADDED_ANNOTATION - })); - return true; - } + var lineIndex = 0; + for ( var line of page.items ) { + const headlineItems = headlineFinder.consume(line); + if (headlineItems) { + headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION); + page.items.splice(lineIndex + 1, 0, new TextItem({ + ...headlineItems[0], + text: headline, + type: headlineByLevel(tocLink.level + 2), + annotation: ADDED_ANNOTATION + })); + return true; } - blockIndex++; - lastBlock = block; + lineIndex++; } return false; }