[WIP] Find TOC headers that were not found by text by matching their size

2025-06-20 17:47:47 +02:00 · 2017-03-15 08:42:46 +01:00 · 2017-03-15 08:42:46 +01:00 · dbd9d8bf5f
commit dbd9d8bf5f
parent 93f15a38b5
7 changed files with 205 additions and 46 deletions
--- a/src/javascript/functions.jsx
+++ b/src/javascript/functions.jsx
@ -62,3 +62,11 @@ export function isListItem(string) {
 /**
  * Tells whether a line of text looks like a numbered list item, e.g. "1. text" or "  23. text".
  *
  * The line must consist of optional leading whitespace, at least one digit, a dot,
  * one whitespace character and then arbitrary remaining text.
  *
  * @param string the line of text to check
  * @returns true if the line starts with a number followed by ". ", false otherwise
  */
 export function isNumberedListItem(string) {
     // At least one digit is required: a bare ". text" is not a *numbered* item.
     // No 'g' flag: it is meaningless for a single test() and makes a regex stateful once it gets shared.
     return /^\s*\d+\.\s.*$/.test(string);
 }
 /**
  * Computes how strongly two strings overlap on word level, ignoring case and word order.
  *
  * Both strings are upper-cased and split into sets of distinct words. The result is the
  * number of words present in both sets divided by the size of the larger set.
  *
  * @param string1 first text
  * @param string2 second text
  * @returns a ratio between 0 (no common word) and 1 (same word sets);
  *          0 if neither string contains a word
  */
 export function wordMatch(string1, string2) {
     // Split on any whitespace run and drop empty fragments, so repeated, leading or
     // trailing whitespace does not produce a bogus empty "word".
     const toWordSet = text => new Set(text.toUpperCase().split(/\s+/).filter(word => word.length > 0));
     const words1 = toWordSet(string1);
     const words2 = toWordSet(string2);

     const largerSize = Math.max(words1.size, words2.size);
     if (largerSize === 0) {
         // Nothing to compare; also avoids a 0/0 = NaN result.
         return 0;
     }
     const commonWords = [...words1].filter(word => words2.has(word));
     return commonWords.length / largerSize;
 }
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiv
 import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
 import DetectTOC from './transformations/textitem/DetectTOC.jsx'
 import DetectListItems from './transformations/textitem/DetectListItems.jsx'
-// import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
+import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
 import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
 import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
--- a/src/javascript/models/ElementType.jsx
+++ b/src/javascript/models/ElementType.jsx
@ -69,6 +69,10 @@ ElementType.initEnum({
    }
 });
 export function isHeadline(elementType: ElementType) {
    return elementType && elementType.name.length == 2 && elementType.name[0] === 'H'
 }
 export function blockToText(block: TextItemBlock) {
    if (!block.type) {
        return concatTextItems(block.textItems);
--- a/src/javascript/models/HeaderLevelAssigner.jsx
+++ b/src/javascript/models/HeaderLevelAssigner.jsx
@ -1,28 +0,0 @@
 // Takes a stream of heights that are potential headers and is meant to produce a header level
 // for each height, or the judgement that the height is no header at all.
 // Levels range from 1..6, where 1 is the biggest headline.
 // HeaderLevelAssigner is used with a start level: if the start level is 2, the first headline
 // gets level 2 and level 1 is never handed out.
 // NOTE: work in progress - only the very first height is mapped to a level so far.
 export default class HeaderLevelAssigner {

     // options.startLevel: level given to the first height that is added
     // options.paragraphHeight: stored for later use, not evaluated yet
     constructor(options) {
         const { startLevel, paragraphHeight } = options;
         this.startLevel = startLevel;
         this.paragraphHeight = paragraphHeight;
         this.lastLevel = null;
         this.lastHeight = null;
         this.heightToLevel = {};
     }

     // Registers the next potential header height.
     add(height) {
         const noPreviousHeight = !this.lastHeight;
         if (noPreviousHeight) {
             // first height seen: it defines the start level
             this.lastLevel = this.startLevel;
             this.heightToLevel[height] = this.startLevel;
         } else if (!this.heightToLevel[height]) {
             // height without a known level: assignment is not implemented yet
         }
         this.lastHeight = height;
     }

 }
--- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx
+++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
@ -0,0 +1,86 @@
 import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
 import ParseResult from '../../ParseResult.jsx';
 import TextItem from '../../TextItem.jsx';
 import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
 import ElementType from '../../ElementType.jsx';
 import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
 import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
 //Detect items starting with -, •, etc...
 export default class DetectListItems extends ToTextItemTransformation {
    constructor() {
        super("Detect Headers");
    }
    transform(parseResult:ParseResult) {
        // analyse existing headers from TOC detection
        const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
        parseResult.pages.forEach(page => {
            page.items.forEach(textItem => {
                if (isHeadline(textItem.type)) {
                    var range = headlineTypeToHeightRange[textItem.type];
                    if (range) {
                        range.min = Math.min(range.min, textItem.height);
                        range.max = Math.max(range.max, textItem.height);
                    } else {
                        range = {
                            min: textItem.height,
                            max: textItem.height
                        };
                        headlineTypeToHeightRange[textItem.type] = range;
                    }
                }
            });
        });
        const existingHeadlineTypes = Object.keys(headlineTypeToHeightRange);
        if (existingHeadlineTypes.length > 0) {
        }
        var foundListItems = 0;
        var foundNumberedItems = 0;
        // parseResult.pages.forEach(page => {
        //     const newTextItems = [];
        //     page.items.forEach(textItem => {
        //         newTextItems.push(textItem);
        //         if (!textItem.type) {
        //             var text = textItem.text;
        //             if (isListItem(text)) {
        //                 foundListItems++
        //                 const textWithDash = '-' + removeLeadingWhitespaces(text).substring(1, text.length);
        //                 if (textWithDash === text) {
        //                     textItem.annotation = DETECTED_ANNOTATION;
        //                     textItem.type = ElementType.LIST;
        //                 } else {
        //                     textItem.annotation = REMOVED_ANNOTATION;
        //                     newTextItems.push(new TextItem({
        //                         ...textItem,
        //                         text: textWithDash,
        //                         annotation: ADDED_ANNOTATION,
        //                         type: ElementType.LIST
        //                     }));
        //                 }
        //             } else if (isNumberedListItem(text)) {
        //                 foundNumberedItems++;
        //                 textItem.annotation = DETECTED_ANNOTATION;
        //                 textItem.type = ElementType.LIST;
        //             }
        //         }
        //     });
        //     page.items = newTextItems;
        // });
        return new ParseResult({
            ...parseResult,
            messages: [
                'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange),
                'Detected ' + foundNumberedItems + ' numbered list items.'
            ]
        });
    }
 }
--- a/src/javascript/models/transformations/textitem/DetectTOC.jsx
+++ b/src/javascript/models/transformations/textitem/DetectTOC.jsx
@ -2,10 +2,10 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
 import ParseResult from '../../ParseResult.jsx';
 import TextItem from '../../TextItem.jsx';
 import HeadlineFinder from '../../HeadlineFinder.jsx';
-import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
 import ElementType from '../../ElementType.jsx';
 import { headlineByLevel } from '../../ElementType.jsx';
-import { isDigit } from '../../../functions.jsx'
+import { isDigit, wordMatch } from '../../../functions.jsx'
 //Detect table of contents pages
 export default class DetectTOC extends ToTextItemTransformation {
@ -99,16 +99,29 @@ export default class DetectTOC extends ToTextItemTransformation {
        //all  pages have been processed
        var foundHeadlines = tocLinks.length;
        const notFoundHeadlines = [];
        const foundBySize = [];
        const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
        if (tocPages.length > 0) {
            // Add TOC items
            tocLinks.forEach(tocLink => {
                lastTocPage.items.push(new TextItem({
                    text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
                    type: ElementType.TOC,
                    annotation: ADDED_ANNOTATION
                }));
            });
            // Add linked headers
            tocLinks.forEach(tocLink => {
                var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
                var foundHeadline = false;
                if (linkedPage) {
-                    foundHeadline = findHeadline(linkedPage, tocLink);
+                    foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
                    if (!foundHeadline) { // pages are off by 1 ?
                        linkedPage = parseResult.pages[tocLink.pageNumber];
                        if (linkedPage) {
-                            foundHeadline = findHeadline(linkedPage, tocLink);
+                            foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
                        }
                    }
                } else {
@ -118,22 +131,53 @@ export default class DetectTOC extends ToTextItemTransformation {
                    notFoundHeadlines.push(tocLink);
                }
            });
-            tocLinks.forEach(tocLink => {
+
-                lastTocPage.items.push(new TextItem({
+            // Try to find linked headers by height
-                    text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
+            var fromPage = lastTocPage.index + 2;
-                    type: ElementType.TOC,
+            var lastNotFound = [];
-                    annotation: ADDED_ANNOTATION
+            const rollupLastNotFound = (currentPageNumber) => {
-                }));
+                if (lastNotFound.length > 0) {
-            });
+                    lastNotFound.forEach(notFoundTocLink => {
                        const headlineType = headlineByLevel(notFoundTocLink.level + 2);
                        const heightRange = headlineTypeToHeightRange[headlineType];
                        if (heightRange) {
                            const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
                            if (textItem) {
                                textItem.type = headlineType;
                                textItem.annotation = DETECTED_ANNOTATION;
                                foundBySize.push(textItem.text);
                            }
                        }
                    });
                    lastNotFound = [];
                }
            }
            if (notFoundHeadlines.length > 0) {
                tocLinks.forEach(tocLink => {
                    if (notFoundHeadlines.includes(tocLink)) {
                        lastNotFound.push(tocLink);
                    } else {
                        rollupLastNotFound(tocLink.pageNumber);
                        fromPage = tocLink.pageNumber;
                    }
                });
                if (lastNotFound.length > 0) {
                    rollupLastNotFound(parseResult.pages.length);
                }
            }
        }
        const messages = [];
        messages.push('Detected ' + tocPages.length + ' table of content pages');
-        if (foundHeadlines > 0) {
+        if (tocPages.length > 0) {
-            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
+            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
            messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
        }
        if (notFoundHeadlines.length > 0) {
-            messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
+            messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
            messages.push('Found TOC headlines (by size): ' + foundBySize);
        }
        return new ParseResult({
            ...parseResult,
@ -148,7 +192,7 @@ export default class DetectTOC extends ToTextItemTransformation {
 }
-function findHeadline(page, tocLink) {
+function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
    const headline = tocLink.textItem.text;
    const headlineFinder = new HeadlineFinder({
        headline: headline
@ -158,12 +202,26 @@ function findHeadline(page, tocLink) {
        const headlineItems = headlineFinder.consume(line);
        if (headlineItems) {
            headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
            const headlineType = headlineByLevel(tocLink.level + 2);
            const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
            page.items.splice(lineIndex + 1, 0, new TextItem({
                ...headlineItems[0],
                text: headline,
-                type: headlineByLevel(tocLink.level + 2),
+                height: headlineHeight,
                type: headlineType,
                annotation: ADDED_ANNOTATION
            }));
            var range = headlineTypeToHeightRange[headlineType];
            if (range) {
                range.min = Math.min(range.min, headlineHeight);
                range.max = Math.max(range.max, headlineHeight);
            } else {
                range = {
                    min: headlineHeight,
                    max: headlineHeight
                };
                headlineTypeToHeightRange[headlineType] = range;
            }
            return true;
        }
        lineIndex++;
@ -171,6 +229,20 @@ function findHeadline(page, tocLink) {
    return false;
 }
 /**
  * Searches a range of pages for a text item which could be the headline a TOC link points to.
  *
  * A candidate has neither a type nor an annotation, its height lies within the given range
  * (inclusive), and at least half of the words of the longer text are shared between the
  * candidate and the TOC link text (see wordMatch).
  *
  * @param pages all pages of the document
  * @param tocLink the TOC link whose headline is searched (its textItem.text is compared)
  * @param heightRange object with min and max height a candidate may have
  * @param fromPage first page to search, 1-based and inclusive
  * @param toPage last page to search, 1-based and inclusive
  * @returns the first matching text item, or undefined if there is none
  */
 function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage) {
     // Page numbers stem from the TOC and may point outside of the document,
     // so restrict the search to pages which actually exist.
     const firstPage = Math.max(fromPage, 1);
     const lastPage = Math.min(toPage, pages.length);
     for (let pageNumber = firstPage; pageNumber <= lastPage; pageNumber++) {
         const page = pages[pageNumber - 1];
         if (!page) {
             continue;
         }
         const headline = page.items.find(line =>
             !line.type
             && !line.annotation
             && line.height >= heightRange.min
             && line.height <= heightRange.max
             && wordMatch(tocLink.textItem.text, line.text) >= 0.5
         );
         if (headline) {
             return headline;
         }
     }
     return undefined;
 }
 class LinkLeveler {
    constructor() {
--- a/test/functions.spec.js
+++ b/test/functions.spec.js
@ -1,6 +1,6 @@
 import { expect } from 'chai';
-import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
+import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'
 describe('hasUpperCaseCharacterInMiddleOfWord', () => {
@ -135,3 +135,20 @@ describe('isNumberedListItem', () => {
    });
 });
 // Suite is named after the function under test, like the other suites in this file
 describe('wordMatch', () => {

     it('Full match', () => {
         expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
     });

     it('Partial match is relative to the larger word set', () => {
         expect(wordMatch('text 1', 'text 2')).to.equal(0.5);
         expect(wordMatch('text 1', 'text 1 2')).to.equal(0.6666666666666666);
         expect(wordMatch('text 1 2 3', 'text 1 4 5')).to.equal(0.5);
         expect(wordMatch('text 1 2 3', 'text')).to.equal(0.25);
     });

     it('Word order does not matter', () => {
         expect(wordMatch('text 1 2 3', '5 1 4 text')).to.equal(0.5);
     });

     it('No match', () => {
         expect(wordMatch('text', 'test')).to.equal(0.0);
     });

     it('Case is ignored', () => {
         expect(wordMatch('inStruCtionS for the full Moon proCeSS', 'Instructions for the Full Moon Process')).to.equal(1.0);
     });

 });