[WIP] find not found TOC-Headers by size

2025-06-20 17:47:47 +02:00 · 2017-03-15 08:42:46 +01:00 · 2017-03-15 08:42:46 +01:00 · dbd9d8bf5f
commit dbd9d8bf5f
parent 93f15a38b5
7 changed files with 205 additions and 46 deletions
--- a/src/javascript/functions.jsx
+++ b/src/javascript/functions.jsx
@ -62,3 +62,11 @@ export function isListItem(string) {
 export function isNumberedListItem(string) {
    return /^[\s]*[\d]*[\.][\s].*$/g.test(string);
 }
+
+/**
+ * Measures how many words two strings have in common, ignoring case.
+ *
+ * Each string is split on whitespace into a set of distinct upper-cased words. Empty tokens
+ * produced by leading, trailing or repeated whitespace are dropped, so they never count as
+ * a shared word.
+ *
+ * @param string1 first text
+ * @param string2 second text
+ * @returns number of shared words divided by the size of the larger word set (0..1),
+ *          or 0 when neither string contains a word
+ */
+export function wordMatch(string1, string2) {
+    const toWordSet = text => new Set(text.toUpperCase().split(/\s+/).filter(word => word.length > 0));
+    const words1 = toWordSet(string1);
+    const words2 = toWordSet(string2);
+    const largerSize = Math.max(words1.size, words2.size);
+    if (largerSize === 0) {
+        return 0; // avoid 0 / 0 = NaN for blank input
+    }
+    const sharedCount = [...words1].filter(word => words2.has(word)).length;
+    return sharedCount / largerSize;
+}
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -6,7 +6,7 @@ import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiv
 import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
 import DetectTOC from './transformations/textitem/DetectTOC.jsx'
 import DetectListItems from './transformations/textitem/DetectListItems.jsx'
-// import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
+import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'

 import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
 import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
--- a/src/javascript/models/ElementType.jsx
+++ b/src/javascript/models/ElementType.jsx
@ -69,6 +69,10 @@ ElementType.initEnum({
    }
 });

+/**
+ * Tells whether an element type is a headline type.
+ * A type counts as headline when its name has exactly two characters and starts with 'H'.
+ *
+ * @param elementType the type to check, may be null/undefined
+ * @returns true for headline types, false otherwise (also for a missing type)
+ */
+export function isHeadline(elementType: ElementType) {
+    if (!elementType) {
+        return false;
+    }
+    const name = elementType.name;
+    return name.length === 2 && name[0] === 'H';
+}
+
 export function blockToText(block: TextItemBlock) {
    if (!block.type) {
        return concatTextItems(block.textItems);
--- a/src/javascript/models/HeaderLevelAssigner.jsx
+++ b/src/javascript/models/HeaderLevelAssigner.jsx
@ -1,28 +0,0 @@
-
-// Input is a flow of heights which are potential headers, output are header level for each height or the judgement, that it is no header
-// Levels are from 1..6, where 1 is the biggest headline.
-// HeaderLevelAssigner is use with an start level. If the start level is 2, then the first headline will be of level 2 and there will be no level 1 given.
-export default class HeaderLevelAssigner {
-
-    constructor(options) {
-        this.startLevel = options.startLevel;
-        this.paragraphHeight = options.paragraphHeight;
-        this.lastLevel = null;
-        this.lastHeight = null;
-        this.heightToLevel = {};
-    }
-
-    add(height) {
-        if (!this.lastHeight) {
-            this.lastLevel = this.startLevel;
-            this.heightToLevel[height] = this.startLevel;
-        } else {
-            const existingLevel = this.heightToLevel[height];
-            if (!existingLevel) {
-                //
-            }
-        }
-
-        this.lastHeight = height;
-    }
-}
--- a/src/javascript/models/transformations/textitem/DetectHeaders.jsx
+++ b/src/javascript/models/transformations/textitem/DetectHeaders.jsx
@ -0,0 +1,86 @@
+import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ParseResult from '../../ParseResult.jsx';
+import TextItem from '../../TextItem.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
+import ElementType from '../../ElementType.jsx';
+import { isHeadline, headlineByLevel } from '../../ElementType.jsx';
+import { isListItem, isNumberedListItem, removeLeadingWhitespaces } from '../../../functions.jsx';
+
+// Collects, per headline type, the height range of the items which already carry a headline type
+// (e.g. assigned by the TOC detection) and reports these ranges as a message.
+// WIP: the ranges are not yet used to turn untyped items into headlines.
+// NOTE(review): the class name is a copy/paste leftover from DetectListItems.jsx. The class is only
+// exported as default, so it could be renamed to DetectHeaders - confirm nothing relies on the class name.
+export default class DetectListItems extends ToTextItemTransformation {
+
+    constructor() {
+        super("Detect Headers");
+    }
+
+    // Returns a copy of the given parse result whose messages list the collected height ranges.
+    transform(parseResult:ParseResult) {
+        // analyse existing headers from TOC detection
+        const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
+        parseResult.pages.forEach(page => {
+            page.items.forEach(textItem => {
+                if (!isHeadline(textItem.type)) {
+                    return;
+                }
+                const range = headlineTypeToHeightRange[textItem.type];
+                if (range) {
+                    // widen the known range of this headline type
+                    range.min = Math.min(range.min, textItem.height);
+                    range.max = Math.max(range.max, textItem.height);
+                } else {
+                    // first item of this headline type
+                    headlineTypeToHeightRange[textItem.type] = {
+                        min: textItem.height,
+                        max: textItem.height
+                    };
+                }
+            });
+        });
+
+        // TODO: use the collected height ranges to detect headlines among the untyped items.
+
+        return new ParseResult({
+            ...parseResult,
+            messages: [
+                'Existing headline heights: ' + JSON.stringify(headlineTypeToHeightRange)
+            ]
+        });
+
+    }
+
+}
--- a/src/javascript/models/transformations/textitem/DetectTOC.jsx
+++ b/src/javascript/models/transformations/textitem/DetectTOC.jsx
@ -2,10 +2,10 @@ import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
 import ParseResult from '../../ParseResult.jsx';
 import TextItem from '../../TextItem.jsx';
 import HeadlineFinder from '../../HeadlineFinder.jsx';
-import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION, DETECTED_ANNOTATION } from '../../Annotation.jsx';
 import ElementType from '../../ElementType.jsx';
 import { headlineByLevel } from '../../ElementType.jsx';
-import { isDigit } from '../../../functions.jsx'
+import { isDigit, wordMatch } from '../../../functions.jsx'

 //Detect table of contents pages
 export default class DetectTOC extends ToTextItemTransformation {
@ -99,16 +99,29 @@ export default class DetectTOC extends ToTextItemTransformation {
        //all  pages have been processed
        var foundHeadlines = tocLinks.length;
        const notFoundHeadlines = [];
+        const foundBySize = [];
+        const headlineTypeToHeightRange = {}; //H1={min:23, max:25}
+
        if (tocPages.length > 0) {
+            // Add TOC items
+            tocLinks.forEach(tocLink => {
+                lastTocPage.items.push(new TextItem({
+                    text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
+                    type: ElementType.TOC,
+                    annotation: ADDED_ANNOTATION
+                }));
+            });
+
+            // Add linked headers
            tocLinks.forEach(tocLink => {
                var linkedPage = parseResult.pages[tocLink.pageNumber - 1];
                var foundHeadline = false;
                if (linkedPage) {
-                    foundHeadline = findHeadline(linkedPage, tocLink);
+                    foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
                    if (!foundHeadline) { // pages are off by 1 ?
                        linkedPage = parseResult.pages[tocLink.pageNumber];
                        if (linkedPage) {
-                            foundHeadline = findHeadline(linkedPage, tocLink);
+                            foundHeadline = findAndAddHeadline(linkedPage, tocLink, headlineTypeToHeightRange);
                        }
                    }
                } else {
@ -118,22 +131,53 @@ export default class DetectTOC extends ToTextItemTransformation {
                    notFoundHeadlines.push(tocLink);
                }
            });
-            tocLinks.forEach(tocLink => {
-                lastTocPage.items.push(new TextItem({
-                    text: ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text,
-                    type: ElementType.TOC,
-                    annotation: ADDED_ANNOTATION
-                }));
-            });
+
+            // Try to find linked headers by height
+            var fromPage = lastTocPage.index + 2;
+            var lastNotFound = [];
+            const rollupLastNotFound = (currentPageNumber) => {
+                if (lastNotFound.length > 0) {
+                    lastNotFound.forEach(notFoundTocLink => {
+                        const headlineType = headlineByLevel(notFoundTocLink.level + 2);
+                        const heightRange = headlineTypeToHeightRange[headlineType];
+                        if (heightRange) {
+                            const textItem = findHeadlinesBySize(parseResult.pages, notFoundTocLink, heightRange, fromPage, currentPageNumber);
+                            if (textItem) {
+                                textItem.type = headlineType;
+                                textItem.annotation = DETECTED_ANNOTATION;
+                                foundBySize.push(textItem.text);
+                            }
+                        }
+                    });
+                    lastNotFound = [];
+                }
+            }
+            if (notFoundHeadlines.length > 0) {
+                tocLinks.forEach(tocLink => {
+                    if (notFoundHeadlines.includes(tocLink)) {
+                        lastNotFound.push(tocLink);
+                    } else {
+                        rollupLastNotFound(tocLink.pageNumber);
+                        fromPage = tocLink.pageNumber;
+                    }
+                });
+                if (lastNotFound.length > 0) {
+                    rollupLastNotFound(parseResult.pages.length);
+                }
+            }
        }

+
+
        const messages = [];
        messages.push('Detected ' + tocPages.length + ' table of content pages');
-        if (foundHeadlines > 0) {
-            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
+        if (tocPages.length > 0) {
+            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length + foundBySize.length) + '/' + foundHeadlines);
+            messages.push('TOC headline heights: ' + JSON.stringify(headlineTypeToHeightRange));
        }
        if (notFoundHeadlines.length > 0) {
-            messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
+            messages.push('Missing TOC headlines (by text): ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
+            messages.push('Found TOC headlines (by size): ' + foundBySize);
        }
        return new ParseResult({
            ...parseResult,
@ -148,7 +192,7 @@ export default class DetectTOC extends ToTextItemTransformation {

 }

-function findHeadline(page, tocLink) {
+function findAndAddHeadline(page, tocLink, headlineTypeToHeightRange) {
    const headline = tocLink.textItem.text;
    const headlineFinder = new HeadlineFinder({
        headline: headline
@ -158,12 +202,26 @@ function findHeadline(page, tocLink) {
        const headlineItems = headlineFinder.consume(line);
        if (headlineItems) {
            headlineItems.forEach(item => item.annotation = REMOVED_ANNOTATION);
+            const headlineType = headlineByLevel(tocLink.level + 2);
+            const headlineHeight = headlineItems.reduce((max, item) => Math.max(max, item.height), 0);
            page.items.splice(lineIndex + 1, 0, new TextItem({
                ...headlineItems[0],
                text: headline,
-                type: headlineByLevel(tocLink.level + 2),
+                height: headlineHeight,
+                type: headlineType,
                annotation: ADDED_ANNOTATION
            }));
+            var range = headlineTypeToHeightRange[headlineType];
+            if (range) {
+                range.min = Math.min(range.min, headlineHeight);
+                range.max = Math.max(range.max, headlineHeight);
+            } else {
+                range = {
+                    min: headlineHeight,
+                    max: headlineHeight
+                };
+                headlineTypeToHeightRange[headlineType] = range;
+            }
            return true;
        }
        lineIndex++;
@ -171,6 +229,20 @@ function findHeadline(page, tocLink) {
    return false;
 }

+/**
+ * Searches a range of pages for the first item without type and annotation which looks like the
+ * headline of a TOC link: its height lies within the given range and its words overlap
+ * sufficiently with the text of the TOC entry.
+ *
+ * @param pages all pages of the document
+ * @param tocLink the TOC entry whose headline is searched (tocLink.textItem.text is compared)
+ * @param heightRange {min, max} bounds (inclusive) for the height of the item
+ * @param fromPage first page number to search (1-based, inclusive)
+ * @param toPage last page number to search (1-based, inclusive)
+ * @param minWordMatch minimal wordMatch() score an item needs to be accepted, defaults to 0.5
+ * @returns the first matching item, or undefined if there is none
+ */
+function findHeadlinesBySize(pages, tocLink, heightRange, fromPage, toPage, minWordMatch = 0.5) {
+    for (let pageNumber = fromPage; pageNumber <= toPage; pageNumber++) {
+        const page = pages[pageNumber - 1];
+        if (!page) {
+            continue; // page number lies outside of the document
+        }
+        for (const item of page.items) {
+            const unclassified = !item.type && !item.annotation;
+            const heightFits = item.height >= heightRange.min && item.height <= heightRange.max;
+            if (unclassified && heightFits && wordMatch(tocLink.textItem.text, item.text) >= minWordMatch) {
+                return item;
+            }
+        }
+    }
+    return undefined;
+}
+

 class LinkLeveler {
    constructor() {
--- a/test/functions.spec.js
+++ b/test/functions.spec.js
@ -1,6 +1,6 @@
 import { expect } from 'chai';

-import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem } from '../src/javascript/functions.jsx'
+import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, removeLeadingWhitespaces, charCodeArray, isListItem, isNumberedListItem, wordMatch } from '../src/javascript/functions.jsx'

 describe('hasUpperCaseCharacterInMiddleOfWord', () => {

@ -135,3 +135,20 @@ describe('isNumberedListItem', () => {
    });

 });
+
+// wordMatch: share of common words of two strings, relative to the larger word set
+describe('wordMatch', () => {
+
+    it('returns 1.0 when both strings contain the same words', () => {
+        expect(wordMatch('text 1', 'text 1')).to.equal(1.0);
+    });
+
+    it('returns the share of common words relative to the larger word set', () => {
+        expect(wordMatch('text 1', 'text 2')).to.equal(0.5);
+        expect(wordMatch('text 1', 'text 1 2')).to.equal(0.6666666666666666);
+        expect(wordMatch('text 1 2 3', 'text 1 4 5')).to.equal(0.5);
+        expect(wordMatch('text 1 2 3', 'text')).to.equal(0.25);
+    });
+
+    it('ignores the order of the words', () => {
+        expect(wordMatch('text 1 2 3', '5 1 4 text')).to.equal(0.5);
+    });
+
+    it('compares whole words only', () => {
+        expect(wordMatch('text', 'test')).to.equal(0.0);
+    });
+
+    it('ignores case', () => {
+        expect(wordMatch('inStruCtionS for the full Moon proCeSS', 'Instructions for the Full Moon Process')).to.equal(1.0);
+    });
+
+});