[WIP] TOC headline parsing

2025-06-26 04:21:40 +02:00 · 2017-03-07 18:42:14 +01:00 · 2017-03-07 18:42:14 +01:00 · 6f69566e98
commit 6f69566e98
parent c9352d8396
7 changed files with 360 additions and 55 deletions
--- a/src/javascript/functions.jsx
+++ b/src/javascript/functions.jsx
@ -1,5 +1,11 @@
+const MIN_DIGIT_CHAR_CODE = 48; // char code of '0'
+const MAX_DIGIT_CHAR_CODE = 57; // char code of '9'
+const WHITESPACE_CHAR_CODE = 32; // char code of the space character ' '
+const TAB_CHAR_CODE = 9; // char code of the horizontal tab '\t'
+const DOT_CHAR_CODE = 46; // char code of '.'
+
 export function isDigit(charCode) {
-    return charCode >= 48 && charCode <= 57;
+    return charCode >= MIN_DIGIT_CHAR_CODE && charCode <= MAX_DIGIT_CHAR_CODE; // true only for char codes 48..57 ('0'..'9')
 }

 export function isNumber(string) {
@ -27,3 +33,17 @@ export function hasUpperCaseCharacterInMiddleOfWord(text) {
    }
    return false;
 }
+
+/**
+ * Converts a string into an array of char codes prepared for lenient comparison:
+ * the text is upper-cased and every space, tab and dot is removed.
+ *
+ * @param {string} string - text to normalize (not modified)
+ * @returns {number[]} char codes of the upper-cased text without spaces, tabs and dots
+ */
+export function normalizedCharCodeArray(string) {
+    // strict comparison: both sides are always numbers, so no coercion is wanted
+    return charCodeArray(string.toUpperCase()).filter(charCode =>
+        charCode !== WHITESPACE_CHAR_CODE &&
+        charCode !== TAB_CHAR_CODE &&
+        charCode !== DOT_CHAR_CODE
+    );
+}
+
+/**
+ * Returns the char code (UTF-16 code unit) found at every index of the given string, in order.
+ *
+ * @param {string} string - text to convert
+ * @returns {number[]} one char code per index of the string; empty for an empty string
+ */
+export function charCodeArray(string) {
+    return Array.from({ length: string.length }, (unused, index) => string.charCodeAt(index));
+}
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -8,6 +8,7 @@ import DetectFootnotes from './transformations/DetectFootnotes.jsx'
 import DetectTOC from './transformations/DetectTOC.jsx'
 import DetectLists from './transformations/DetectLists.jsx'
 import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
+import DetectHeadlines from './transformations/DetectHeadlines.jsx'
 // import DetectFormats from './transformations/DetectFormats.jsx'
 // import CombineSameY from './transformations/CombineSameY.jsx';
 // import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
@ -35,6 +36,7 @@ export default class AppState {
            new DetectTOC(),
            new DetectLists(),
            new DetectCodeBlocks(),
+            new DetectHeadlines(),

            // new DetectFormats(),
            // new CombineSameY(),
--- a/src/javascript/models/HeadlineFinder.jsx
+++ b/src/javascript/models/HeadlineFinder.jsx
@ -0,0 +1,40 @@
+import { normalizedCharCodeArray } from '../functions.jsx'
+
+/**
+ * Incrementally matches a stream of text items against one headline.
+ *
+ * Texts are compared through normalizedCharCodeArray, i.e. upper-cased and with
+ * spaces, tabs and dots removed. Items are fed one at a time via consume(); items
+ * that continue the headline are collected in `stackedTextItems` until the whole
+ * headline is covered.
+ */
+export default class HeadlineFinder {
+
+    /**
+     * @param {Object} options
+     * @param {string} options.headline - headline text to search for
+     */
+    constructor(options) {
+        this.headlineCharCodes = normalizedCharCodeArray(options.headline);
+        // items of the current (partial) match, in consumption order
+        this.stackedTextItems = [];
+        // number of normalized headline chars covered by the stacked items
+        this.stackedChars = 0;
+    }
+
+    /**
+     * Feeds the next text item to the finder.
+     *
+     * @param {Object} textItem - item whose `text` property is compared with the headline
+     * @returns {?Array} the stacked items once they cover the complete headline, otherwise null
+     */
+    consume(textItem) {
+        const normalizedCharCodes = normalizedCharCodeArray(textItem.text);
+        if (this.matchAll(normalizedCharCodes)) {
+            this.stackedTextItems.push(textItem);
+            this.stackedChars += normalizedCharCodes.length;
+            if (this.stackedChars === this.headlineCharCodes.length) {
+                return this.stackedTextItems;
+            }
+            return null;
+        }
+        if (this.stackedTextItems.length > 0) {
+            // The item does not continue the current partial match. Drop the whole stack
+            // (also items that contributed zero chars, which would otherwise linger) and
+            // test the item again as a possible start of the headline. The retry result
+            // must be returned, since the item may complete the headline on its own.
+            this.stackedChars = 0;
+            this.stackedTextItems = [];
+            return this.consume(textItem);
+        }
+        return null;
+    }
+
+    /**
+     * Checks whether the given char codes continue the headline right after the
+     * chars that are already stacked.
+     *
+     * @param {number[]} normalizedCharCodes - normalized char codes of one text item
+     * @returns {boolean} true when every char equals the headline char at its position
+     *          (trivially true for an empty array); false on any difference or when the
+     *          chars run past the end of the headline
+     */
+    matchAll(normalizedCharCodes) {
+        return normalizedCharCodes.every((textItemChar, i) =>
+            textItemChar === this.headlineCharCodes[this.stackedChars + i]
+        );
+    }
+}
--- a/src/javascript/models/transformations/DetectLists.jsx
+++ b/src/javascript/models/transformations/DetectLists.jsx
@ -35,9 +35,9 @@ export default class DetectLists extends ToPdfBlockViewTransformation {

                            var lastItemX;
                            var currentLevel = 0;
+                            var xByLevel = {};
                            var itemsBeforeFirstLineItem = [];
                            var listBlockItems = [];
-                            var xByLevel = {};

                            const pushLineItem = (originalItem, text, setLevel) => {
                                if (lastItemX && setLevel) {
--- a/src/javascript/models/transformations/DetectTOC.jsx
+++ b/src/javascript/models/transformations/DetectTOC.jsx
@ -3,8 +3,9 @@ import ParseResult from '../ParseResult.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfBlock from '../PdfBlock.jsx';
 import TextItemCombiner from '../TextItemCombiner.jsx';
+import HeadlineFinder from '../HeadlineFinder.jsx';
 import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
-import { TOC_BLOCK, HEADLINE2 } from '../MarkdownElements.jsx';
+import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
 import { isDigit } from '../../functions.jsx'

 //Detect table of contents pages
@ -22,14 +23,16 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
            mostUsedDistance: mostUsedDistance
        });

-        var lastLevel = 0;
-        const itemLeveler = new ItemLeveler();
+        const linkLeveler = new LinkLeveler();
+        var tocLinks = [];
+        var lastTocPage;
        parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
            var linesCount = 0;
            var linesWithDigitsCount = 0;
            var lineItemsWithDigits = [];
            const unknownBlocks = new Set();
            var headlineBlock;
+            const pageTocLinks = [];
            page.blocks.forEach(block => {
                var blockHasLinesWithDigits = false;
                const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
@ -38,8 +41,10 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                    linesCount++
                    var lineText = lineItem.text.replace(/\./g, '').trim();
                    var endsWithDigit = false;
+                    var digits = [];
                    while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
-                        lineText = lineText.substring(0, lineText.length - 2);
+                        digits.unshift(lineText.charAt(lineText.length - 1));
+                        lineText = lineText.substring(0, lineText.length - 1);
                        endsWithDigit = true;
                    }
                    lineText = lineText.trim();
@ -50,6 +55,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                        }
                        linesWithDigitsCount++;
                        blockHasLinesWithDigits = true;
+                        pageTocLinks.push(new TocLink({
+                            pageNumber: parseInt(digits.join('')),
+                            textItem: new TextItem({
+                                ...lineItem,
+                                text: lineText
+                            })
+                        }));
                        lineItemsWithDigits.push(new TextItem({
                            ...lineItem,
                            text: lineText
@ -67,8 +79,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                }
            });

+            // page has been processed
            if (linesWithDigitsCount * 100 / linesCount > 75) {
                tocPages.push(page.index + 1);
+                lastTocPage = page;
+                linkLeveler.levelPageItems(pageTocLinks);
+                tocLinks = tocLinks.concat(pageTocLinks);
+
                const newBlocks = [];
                page.blocks.forEach((block) => {
                    if (!unknownBlocks.has(block)) {
@ -83,17 +100,50 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                        }));
                    }
                });
-                // lastLevel = processLevels(lineItemsWithDigits, lastLevel);
-                itemLeveler.level(lineItemsWithDigits);
-                newBlocks.push(new PdfBlock({
-                    textItems: lineItemsWithDigits,
-                    type: TOC_BLOCK,
-                    annotation: ADDED_ANNOTATION
-                }));
                page.blocks = newBlocks;
            }
        });

+        //all  pages have been processed
+        var foundHeadlines = tocLinks.length;
+        const notFoundHeadlines = [];
+        if (tocPages.length > 0) {
+            tocLinks.forEach(tocLink => {
+                var linkedPage = parseResult.content[tocLink.pageNumber - 1];
+                var foundHeadline = false;
+                if (linkedPage) {
+                    foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                    if (!foundHeadline) { // pages are off by 1 ?
+                        linkedPage = parseResult.content[tocLink.pageNumber];
+                        if (linkedPage) {
+                            foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                        }
+                    }
+                } else {
+                    //TODO sometimes pages are off. We could try the page range from pre to next ...
+                }
+                if (!foundHeadline) {
+                    notFoundHeadlines.push(tocLink);
+                }
+            });
+            lastTocPage.blocks.push(new PdfBlock({
+                textItems: tocLinks.map(tocLink => {
+                    tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
+                    return tocLink.textItem
+                }),
+                type: TOC_BLOCK,
+                annotation: ADDED_ANNOTATION
+            }));
+        }
+
+        const messages = [];
+        messages.push('Detected ' + tocPages.length + ' table of content pages');
+        if (foundHeadlines > 0) {
+            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
+        }
+        if (notFoundHeadlines.length > 0) {
+            messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
+        }
        return new ParseResult({
            ...parseResult,
            globals: {
@ -101,27 +151,61 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                tocPages: tocPages

            },
-            messages: ['Detected ' + tocPages.length + ' table of content pages']
+            messages: messages
        });
    }

 }

+/**
+ * Searches the blocks of a page for the headline a TOC link refers to.
+ *
+ * When the headline is found, the block containing the last matched item is annotated
+ * as removed and a new headline block (typed via headlineByLevel(tocLink.level + 2))
+ * is inserted directly after it. The new block holds a copy of the first matched item
+ * with its text replaced by the TOC headline text.
+ *
+ * @param page - page whose `blocks` array is scanned and, on success, modified
+ * @param tocLink - TOC entry supplying the headline text (`textItem.text`) and `level`
+ * @param textCombiner - object whose `combine(textItems)` result exposes `textItems`
+ * @returns {boolean} true when the headline was found on the page, false otherwise
+ */
+function findHeadline(page, tocLink, textCombiner) {
+    const headlineText = tocLink.textItem.text;
+    const finder = new HeadlineFinder({
+        headline: headlineText
+    });
+    let previousBlock;
+    for (let index = 0; index < page.blocks.length; index++) {
+        const currentBlock = page.blocks[index];
+        const combinedItems = textCombiner.combine(currentBlock.textItems).textItems;
+        for (const combinedItem of combinedItems) {
+            if (!finder.consume(combinedItem)) {
+                continue;
+            }
+            const matchedItems = finder.stackedTextItems;
+            currentBlock.annotation = REMOVED_ANNOTATION;
+            if (matchedItems.length > combinedItems.length) {
+                // more items matched than this block contains ("2 line headline"):
+                // the preceding block is marked as removed as well
+                previousBlock.annotation = REMOVED_ANNOTATION;
+            }
+            const headlineBlock = new PdfBlock({
+                textItems: [new TextItem({
+                    ...matchedItems[0],
+                    text: headlineText
+                })],
+                type: headlineByLevel(tocLink.level + 2),
+                annotation: ADDED_ANNOTATION
+            });
+            page.blocks.splice(index + 1, 0, headlineBlock);
+            return true;
+        }
+        previousBlock = currentBlock;
+    }
+    return false;
+}

-class ItemLeveler {
+
+class LinkLeveler {
    constructor() {
        this.levelByMethod = null;
        this.uniqueFonts = [];
-        this.headlines = [];
    }

-    level(lineItemsWithDigits) {
+    levelPageItems(tocLinks:TocLink[]) {
        if (!this.levelByMethod) {
-            const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
+            const uniqueX = this.calculateUniqueX(tocLinks);
            if (uniqueX.length > 1) {
                this.levelByMethod = this.levelByXDiff;
            } else {
-                const uniqueFonts = this.calculateUniqueFonts(lineItemsWithDigits);
+                const uniqueFonts = this.calculateUniqueFonts(tocLinks);
                if (uniqueFonts.length > 1) {
                    this.uniqueFonts = uniqueFonts;
                    this.levelByMethod = this.levelByFont;
@ -130,46 +214,31 @@ class ItemLeveler {
                }
            }
        }
-        this.levelByMethod(lineItemsWithDigits);
+        this.levelByMethod(tocLinks);
    }

-    levelByXDiff(lineItemsWithDigits) {
-        const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
-        lineItemsWithDigits.forEach(item => {
-            const level = uniqueX.indexOf(item.x);
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelByXDiff(tocLinks) {
+        const uniqueX = this.calculateUniqueX(tocLinks);
+        tocLinks.forEach(link => {
+            link.level = uniqueX.indexOf(link.textItem.x);
        });
    }

-    levelByFont(lineItemsWithDigits) {
-        lineItemsWithDigits.forEach(item => {
-            const level = this.uniqueFonts.indexOf(item.font);
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelByFont(tocLinks) {
+        tocLinks.forEach(link => {
+            link.level = this.uniqueFonts.indexOf(link.textItem.font);
        });
    }

-    levelToZero(lineItemsWithDigits) {
-        lineItemsWithDigits.forEach(item => {
-            const level = 0;
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelToZero(tocLinks) {
+        tocLinks.forEach(link => {
+            link.level = 0;
        });
    }

-    calculateUniqueX(lineItemsWithDigits) {
-        var uniqueX = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
-            if (uniquesArray.indexOf(lineItem.x) < 0) uniquesArray.push(lineItem.x);
+    calculateUniqueX(tocLinks) {
+        var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
+            if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
            return uniquesArray;
        }, []);

@ -180,9 +249,9 @@ class ItemLeveler {
        return uniqueX;
    }

-    calculateUniqueFonts(lineItemsWithDigits) {
-        var uniqueFont = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
-            if (uniquesArray.indexOf(lineItem.font) < 0) uniquesArray.push(lineItem.font);
+    calculateUniqueFonts(tocLinks) {
+        var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
+            if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
            return uniquesArray;
        }, []);

@ -191,9 +260,10 @@ class ItemLeveler {

 }

-class Headline {
+class TocLink { // one TOC entry: the text item of the line, the page number it points to, and its level
    constructor(options) {
-        this.level = options.level;
-        this.text = options.text;
+        this.textItem = options.textItem;
+        this.pageNumber = options.pageNumber;
+        this.level = 0; // initial level; reassigned by LinkLeveler's level* methods
    }
 }
--- a/test/HeadlineFinder.spec.js
+++ b/test/HeadlineFinder.spec.js
@ -0,0 +1,134 @@
+import { expect } from 'chai';
+
+import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
+import TextItem from '../src/javascript/models/TextItem.jsx';
+
+describe('HeadlineFinder', () => {
+
+
+    it('Not Found - Case 1', () => { // the last item carries a trailing '2', so the headline is never completed
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline2'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(0); // the mismatch discards the partial match
+
+    });
+
+    it('Found - Simple', () => { // three consecutive items that together spell the headline
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3); // the completing item returns all stacked items
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Waste in beginning', () => { // a non-matching item ahead of the headline is not stacked
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item0 = new TextItem({
+            text: 'Waste '
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item0)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(0); // 'Waste ' does not start the headline
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Duplicate in beginning', () => { // a repeated first fragment restarts the match from the second 'My '
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item0 = new TextItem({
+            text: 'My '
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item0)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0);
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1); // item0 was dropped, item1 is the new start
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Mixed up case and Whitespace', () => { // headline and items differ in letter case and spacing but still match
+        const headlineFinder = new HeadlineFinder({
+            headline: 'MYLitt le HEADline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+});
--- a/test/functions.spec.js
+++ b/test/functions.spec.js
@ -1,6 +1,6 @@
 import { expect } from 'chai';

-import { hasUpperCaseCharacterInMiddleOfWord } from '../src/javascript/functions.jsx'
+import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx'

 describe('hasUpperCaseCharacterInMiddleOfWord', () => {

@ -37,3 +37,42 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
        expect(hasUpperCaseCharacterInMiddleOfWord("High 5'Sec")).to.equal(true);
    });
 });
+
+describe('charCodeArray', () => {
+    it('Charcodes', () => {
+        expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46); // 46 is the char code of '.'
+    });
+
+    it('Convert Back', () => { // converting the char codes back to a string must reproduce the input unchanged
+        expect(String.fromCharCode.apply(null, charCodeArray("word"))).to.equal("word");
+        expect(String.fromCharCode.apply(null, charCodeArray("WORD"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, charCodeArray("a word"))).to.equal("a word");
+    });
+
+});
+
+describe('normalizedCharCodeArray', () => {
+
+    it('No Change', () => { // upper-case letters and digits pass through untouched
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD23"))).to.equal("WORD23");
+    });
+
+    it('lowecaseToUpperCase', () => { // lower-case letters come back upper-cased
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("word"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WoRd"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("word23"))).to.equal("WORD23");
+    });
+
+    it('RemoveWhiteSpace', () => { // spaces and the trailing dot are dropped
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("A WORD"))).to.equal("AWORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("SOME LITTLE SENTENCE."))).to.equal("SOMELITTLESENTENCE");
+    });
+
+    it('All', () => { // upper-casing and removal of spaces/dots combined
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("a word"))).to.equal("AWORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WoRd 4 u"))).to.equal("WORD4U");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("Some little sentence."))).to.equal("SOMELITTLESENTENCE");
+    });
+
+});