WIP initial TOC detection

2025-08-19 09:51:36 +02:00 · 2017-02-19 10:20:14 +01:00
parent bed3fd357b
commit 2783d724e5
5 changed files with 129 additions and 27 deletions
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/CalculateGlobalStats.jsx';
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
 import VerticalToHorizontal from './transformations/VerticalToHorizontal.jsx';
 import DetectPdfBlocks from './transformations/DetectPdfBlocks.jsx'
 import DetectTOC from './transformations/DetectTOC.jsx'
 import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
 import DetectFormats from './transformations/DetectFormats.jsx'
 import CombineSameY from './transformations/CombineSameY.jsx';
@@ -29,6 +30,7 @@ export default class AppState {
            new RemoveRepetitiveElements(),
            new VerticalToHorizontal(),
            new DetectPdfBlocks(),
            new DetectTOC(),
            new DetectCodeBlocks(),
            // new DetectFormats(),
            // new CombineSameY(),
--- a/src/javascript/models/MarkdownElements.jsx
+++ b/src/javascript/models/MarkdownElements.jsx
@@ -1,17 +1,25 @@
 import PdfBlock from './BlockPage.jsx';
 // Block type identifiers; blockToText compares them against block.type.
 export const HEADLINE1 = "Headline 1";
 export const CODE_BLOCK = "Code/Quote";
 export const TOC_BLOCK = "TOC";
 export function blockToText(block: PdfBlock) {
    const text = concatTextItems(block);
    switch (block.type) {
    case CODE_BLOCK:
-        return '```\n' + text + '```'
+        return '```\n' + concatTextItems(block) + '```'
-    case HEADLINE1:
+    case TOC_BLOCK:
-        return '#' + text;
+        //TODO 2nd level
-    default:
+        //TODO real links
        var text = '';
        block.textItems.forEach(item => {
            text += '- ' + item.text + '\n';
        });
        return text;
    case HEADLINE1:
        return '#' + concatTextItems(block);
    default:
        return concatTextItems(block);
    }
 }
--- a/src/javascript/models/TextItemCombiner.jsx
+++ b/src/javascript/models/TextItemCombiner.jsx
@@ -6,26 +6,32 @@ export default class TextItemCombiner {
    constructor(options) {
        this.transformEmphasis = options.transformEmphasis || true;
-        console.debug(this.transformEmphasis);
+        this.maxYDerivation = options.transformEmphasis || 3;
    }
    // returns a TextItem array new items
    combine(textItems: TextItem[]) {
        const resultItems = [];
-        const groupedItems = groupByFollowingY(textItems);
+        const groupedItems = this.groupByFollowingY(textItems);
        groupedItems.forEach(itemGroup => {
            if (itemGroup.length == 1) {
                resultItems.push(itemGroup[0]);
            } else {
                var text = '';
                var maxHeight = 0;
                var widthSum = 0;
                itemGroup.forEach(item => {
                    // item.annotation = REMOVED_ANNOTATION;
                    // resultItems.push(item);
                    text += item.text;
                    widthSum += item.width;
                });
                //TODO set other elements
                resultItems.push(new TextItem({
                    ...itemGroup[0],
                    text: text,
                    height: maxHeight,
                    width: widthSum,
                }));
            }
        });
@@ -36,21 +42,20 @@ export default class TextItemCombiner {
        return resultItems;
    }
-}
+    groupByFollowingY(textItems) {
-
+        const yArrays = [];
-function groupByFollowingY(textItems) {
+        var itemsWithSameY = [];
-    const yArrays = [];
+        var lastItem;
-    var itemsWithSameY = [];
+        textItems.forEach(item => {
-    var lastItem;
+            if (itemsWithSameY.length == 0 || Math.abs(lastItem.y - item.y) <= this.maxYDerivation) {
-    textItems.forEach(item => {
+                itemsWithSameY.push(item);
-        if (itemsWithSameY.length == 0 || item.y == lastItem.y) {
+            } else {
-            itemsWithSameY.push(item);
+                yArrays.push(itemsWithSameY);
-        } else {
+                itemsWithSameY = [item];
-            yArrays.push(itemsWithSameY);
+            }
-            itemsWithSameY = [item];
+            lastItem = item;
-        }
+        })
-        lastItem = item;
+        yArrays.push(itemsWithSameY);
-    })
+        return yArrays;
-    yArrays.push(itemsWithSameY);
+    }
    return yArrays;
 }
--- a/src/javascript/models/transformations/DetectCodeBlocks.jsx
+++ b/src/javascript/models/transformations/DetectCodeBlocks.jsx
@@ -20,8 +20,6 @@ export default class DetectCodeBlocks extends ToPdfBlockViewTransformation {
               </div>;
    }
    // TODO: combine quotes that follow each other
    transform(parseResult:ParseResult) {
        const {mostUsedHeight, mostUsedDistance} = parseResult.globals;
--- a/src/javascript/models/transformations/DetectTOC.jsx
+++ b/src/javascript/models/transformations/DetectTOC.jsx
@@ -0,0 +1,89 @@
 import React from 'react';
 import ToPdfBlockViewTransformation from './ToPdfBlockViewTransformation.jsx';
 import ParseResult from '../ParseResult.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfBlock from '../PdfBlock.jsx';
 import TextItemCombiner from '../TextItemCombiner.jsx';
 import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
 import { TOC_BLOCK } from '../MarkdownElements.jsx';
 import Annotation from '../Annotation.jsx';
 import { groupByFollowingY } from '../TextItemCombiner.jsx';
 import { isNumber, isDigit } from '../../functions.jsx'
 //Detect table of contents pages
 export default class DetectTOC extends ToPdfBlockViewTransformation {
    constructor() {
        super("Detect Table of Contents");
    }
    createSummaryView(parseResult:ParseResult) {
        return <div>
                 Detected
                 { ' ' + parseResult.summary.foundTocPages + ' ' } table of content pages.
               </div>;
    }
    transform(parseResult:ParseResult) {
        var foundTocPages = 0;
        var x = Math.min(12, parseResult.content.length);
        const textCombiner = new TextItemCombiner({});
        parseResult.content.slice(0, x).forEach(page => {
            var linesCount = 0;
            var linesWithDigitsCount = 0;
            var lineItemsWithDigits = [];
            var headlineBlock;
            page.blocks.forEach(block => {
                var blockHasLinesWithDigits = false;
                const itemsGroupedByY = textCombiner.combine(block.textItems);
                itemsGroupedByY.forEach(lineItem => {
                    linesCount++
                    var lineText = lineItem.text.replace(/\./g, '').trim();
                    var endsWithDigit = false;
                    while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
                        lineText = lineText.substring(0, lineText.length - 2);
                        endsWithDigit = true;
                    }
                    lineText = lineText.trim();
                    if (endsWithDigit) {
                        linesWithDigitsCount++;
                        blockHasLinesWithDigits = true;
                        lineItemsWithDigits.push(new TextItem({
                            ...lineItem,
                            text: lineText
                        }));
                    }
                });
                if (!blockHasLinesWithDigits) {
                    if (!headlineBlock) {
                        headlineBlock = block;
                    }
                }
            });
            if (linesWithDigitsCount * 100 / linesCount > 75) {
                foundTocPages++;
                page.blocks.forEach(block => {
                    if (block !== headlineBlock) {
                        block.annotation = REMOVED_ANNOTATION;
                    }
                });
                page.blocks.push(new PdfBlock({
                    textItems: lineItemsWithDigits,
                    type: TOC_BLOCK,
                    annotation: ADDED_ANNOTATION
                }));
            }
        });
        return new ParseResult({
            ...parseResult,
            summary: {
                foundTocPages: foundTocPages
            }
        });
    }
 }