[WIP] detect TOC on text items, not on blocks

2025-06-26 12:31:52 +02:00 · 2017-03-10 09:52:29 +01:00 · 2017-03-10 09:52:29 +01:00 · bd4c207ae3
commit bd4c207ae3
parent e2481bdd2a
3 changed files with 79 additions and 98 deletions
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@@ -49,6 +49,9 @@ export default class TextItemTable extends React.Component {
                                                                <div style={ { textAlign: 'center' } }>
                                                                  { textItem.annotation ? textItem.annotation.category : '' }
                                                                </div>
+                                                                <div style={ { textAlign: 'center', color: 'brown' } }>
+                                                                  { textItem.type ? textItem.type : '' }
+                                                                </div>
                                                                <div style={ { textAlign: 'center', color: 'orange' } }>
                                                                  { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
                                                                  { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -1,13 +1,13 @@
 import { Enum } from 'enumify';

 import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
+import CompactLines from './transformations/CompactLines.jsx';
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
 import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
-import CompactLines from './transformations/CompactLines.jsx';
+import DetectTOC from './transformations/DetectTOC.jsx'

 import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
 import DetectFootnotes from './transformations/DetectFootnotes.jsx'
-import DetectTOC from './transformations/DetectTOC.jsx'
 import DetectLists from './transformations/DetectLists.jsx'
 import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
 import DetectHeadlines from './transformations/DetectHeadlines.jsx'
@@ -34,10 +34,10 @@ export default class AppState {
            new CompactLines(),
            new RemoveRepetitiveElements(),
            new VerticalToHorizontal(),
+            new DetectTOC(),

            new DetectPdfBlocks(),
            new DetectFootnotes(),
-            new DetectTOC(),
            new DetectLists(),
            new DetectCodeBlocks(),
            new DetectHeadlines(),
--- a/src/javascript/models/transformations/DetectTOC.jsx
+++ b/src/javascript/models/transformations/DetectTOC.jsx
@@ -1,45 +1,34 @@
-import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
+import ToTextItemTransformation from './ToTextItemTransformation.jsx';
 import ParseResult from '../ParseResult.jsx';
 import TextItem from '../TextItem.jsx';
-import TextItemBlock from '../TextItemBlock.jsx';
-import TextItemCombiner from '../TextItemCombiner.jsx';
 import HeadlineFinder from '../HeadlineFinder.jsx';
 import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
 import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
 import { isDigit } from '../../functions.jsx'

 //Detect table of contents pages
-export default class DetectTOC extends ToTextItemBlockTransformation {
+export default class DetectTOC extends ToTextItemTransformation {

    constructor() {
        super("Detect TOC");
    }

    transform(parseResult:ParseResult) {
-        const {mostUsedDistance} = parseResult.globals;
        const tocPages = [];
        const maxPagesToEvaluate = Math.min(20, parseResult.pages.length);
-        const textCombiner = new TextItemCombiner({
-            mostUsedDistance: mostUsedDistance
-        });

        const linkLeveler = new LinkLeveler();
        var tocLinks = [];
        var lastTocPage;
+        var headlineItem;
        parseResult.pages.slice(0, maxPagesToEvaluate).forEach(page => {
-            var linesCount = 0;
-            var linesWithDigitsCount = 0;
-            var lineItemsWithDigits = [];
-            const unknownBlocks = new Set();
-            var headlineBlock;
+            const lineItemsWithDigits = [];
+            const unknownLines = new Set();
            const pageTocLinks = [];
-            page.items.forEach(block => {
-                var blockHasLinesWithDigits = false;
-                const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
            var lastLineTextWithoutNumber;
-                itemsGroupedByY.forEach(lineItem => {
-                    linesCount++
-                    var lineText = lineItem.text.replace(/\./g, '').trim();
+            var lastLine;
+            page.items.forEach(line => {
+                var lineText = line.text.replace(/\./g, '').trim();
                var endsWithDigit = false;
                var digits = [];
                while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
@@ -49,58 +38,60 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
                }
                lineText = lineText.trim();
                if (endsWithDigit) {
+                    endsWithDigit = true;
                    if (lastLineTextWithoutNumber) { // 2-line item ?
                        lineText = lastLineTextWithoutNumber + ' ' + lineText;
                        lastLineTextWithoutNumber = null;
                    }
-                        linesWithDigitsCount++;
-                        blockHasLinesWithDigits = true;
                    pageTocLinks.push(new TocLink({
                        pageNumber: parseInt(digits.join('')),
                        textItem: new TextItem({
-                                ...lineItem,
+                            ...line,
                            text: lineText
                        })
                    }));
                    lineItemsWithDigits.push(new TextItem({
-                            ...lineItem,
+                        ...line,
                        text: lineText
                    }));
+                    lastLineTextWithoutNumber = null;
                } else {
-                        lastLineTextWithoutNumber = lineText;
+                    if (!headlineItem) {
+                        headlineItem = line;
+                    } else {
+                        if (lastLineTextWithoutNumber) {
+                            unknownLines.add(lastLine);
                        }
-                });
-                if (!blockHasLinesWithDigits) {
-                    if (!headlineBlock) {
-                        headlineBlock = block;
-                    } else {
-                        unknownBlocks.add(block);
+                        lastLineTextWithoutNumber = lineText;
+                        lastLine = line;
                    }
                }
            });

            // page has been processed
-            if (linesWithDigitsCount * 100 / linesCount > 75) {
+            if (lineItemsWithDigits.length * 100 / page.items.length > 75) {
                tocPages.push(page.index + 1);
                lastTocPage = page;
                linkLeveler.levelPageItems(pageTocLinks);
                tocLinks = tocLinks.concat(pageTocLinks);

                const newBlocks = [];
-                page.items.forEach((block) => {
-                    if (!unknownBlocks.has(block)) {
-                        block.annotation = REMOVED_ANNOTATION;
+                page.items.forEach((line) => {
+                    if (!unknownLines.has(line)) {
+                        line.annotation = REMOVED_ANNOTATION;
                    }
-                    newBlocks.push(block);
-                    if (block === headlineBlock) {
-                        newBlocks.push(new TextItemBlock({
-                            textItems: textCombiner.combine(block.textItems).textItems,
+                    newBlocks.push(line);
+                    if (line === headlineItem) {
+                        newBlocks.push(new TextItem({
+                            ...line,
                            type: HEADLINE2,
                            annotation: ADDED_ANNOTATION
                        }));
                    }
                });
                page.items = newBlocks;
+            } else {
+                headlineItem = null;
            }
        });

@@ -112,11 +103,11 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
                var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
                var foundHeadline = false;
                if (linkedPage) {
-                    foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                    foundHeadline = findHeadline(linkedPage, tocLink);
                    if (!foundHeadline) { // pages are off by 1 ?
                        linkedPage = parseResult.pages[tocLink.pageNumber];
                        if (linkedPage) {
-                            foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                            foundHeadline = findHeadline(linkedPage, tocLink);
                        }
                    }
                } else {
@@ -126,14 +117,13 @@ export default class DetectTOC extends ToTextItemBlockTransformation {
                    notFoundHeadlines.push(tocLink);
                }
            });
-            lastTocPage.items.push(new TextItemBlock({
-                textItems: tocLinks.map(tocLink => {
-                    tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
-                    return tocLink.textItem
-                }),
+            tocLinks.forEach(tocLink => {
+                lastTocPage.items.push(new TextItem({
+                    text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
                    type: TOC_BLOCK,
                    annotation: ADDED_ANNOTATION
                }));
+            });
        }

        const messages = [];
@@ -157,37 +147,25 @@ export default class DetectTOC extends ToTextItemBlockTransformation {

 }

-function findHeadline(page, tocLink, textCombiner) {
+function findHeadline(page, tocLink) {
    const headline = tocLink.textItem.text;
    const headlineFinder = new HeadlineFinder({
        headline: headline
    });
-    var blockIndex = 0;
-    var lastBlock;
-    for ( var block of page.items ) {
-        const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
-        for ( var item of itemsGroupedByY ) {
-            const headlineItems = headlineFinder.consume(item);
+    var lineIndex = 0;
+    for ( var line of page.items ) {
+        const headlineItems = headlineFinder.consume(line);
        if (headlineItems) {
-                const usedItems = headlineFinder.stackedTextItems;
-                block.annotation = REMOVED_ANNOTATION;
-                if (usedItems.length > itemsGroupedByY.length) {
-                    // 2 line headline
-                    lastBlock.annotation = REMOVED_ANNOTATION;
-                }
-                page.items.splice(blockIndex + 1, 0, new TextItemBlock({
-                    textItems: [new TextItem({
-                        ...usedItems[0],
-                        text: headline
-                    })],
+            headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
+            page.items.splice(lineIndex + 1, 0, new TextItem({
+                ...headlineItems[0],
+                text: headline,
                type: headlineByLevel(tocLink.level + 2),
                annotation: ADDED_ANNOTATION
            }));
            return true;
        }
-        }
-        blockIndex++;
-        lastBlock = block;
+        lineIndex++;
    }
    return false;
 }