Mirror of https://github.com/jzillmann/pdf-to-markdown.git (synced 2024-11-22 07:43:46 +01:00)

Commit 6f69566e98: [WIP] TOC headline parsing
Parent commit: c9352d8396
src/javascript/functions.jsx:

@@ -1,5 +1,11 @@
+const MIN_DIGIT_CHAR_CODE = 48;
+const MAX_DIGIT_CHAR_CODE = 57;
+const WHITESPACE_CHAR_CODE = 32;
+const TAB_CHAR_CODE = 9;
+const DOT_CHAR_CODE = 46;
+
 export function isDigit(charCode) {
-    return charCode >= 48 && charCode <= 57;
+    return charCode >= MIN_DIGIT_CHAR_CODE && charCode <= MAX_DIGIT_CHAR_CODE;
 }
 
 export function isNumber(string) {
@@ -27,3 +33,17 @@ export function hasUpperCaseCharacterInMiddleOfWord(text) {
     }
     return false;
 }
+
+// Remove whitespace/dots + to uppercase
+export function normalizedCharCodeArray(string) {
+    string = string.toUpperCase();
+    return charCodeArray(string).filter(charCode => charCode != WHITESPACE_CHAR_CODE && charCode != TAB_CHAR_CODE && charCode != DOT_CHAR_CODE);
+}
+
+export function charCodeArray(string) {
+    const charCodes = [];
+    for (var i = 0; i < string.length; i++) {
+        charCodes.push(string.charCodeAt(i));
+    }
+    return charCodes;
+}
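For orientation (not part of the commit): the two new helpers reduce a string to comparable character codes. charCodeArray returns the raw char codes, while normalizedCharCodeArray additionally uppercases the string and drops spaces, tabs and dots, which is what later lets a TOC entry match a headline regardless of case and spacing. A minimal usage sketch, assuming the import path used by the specs below:

import { charCodeArray, normalizedCharCodeArray } from '../src/javascript/functions.jsx';

charCodeArray('Ab.');                                                      // [65, 98, 46]
String.fromCharCode(...normalizedCharCodeArray('Some little sentence.'));  // 'SOMELITTLESENTENCE'
String.fromCharCode(...normalizedCharCodeArray(' Headline'));              // 'HEADLINE'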
src/javascript/models/AppState.jsx:

@@ -8,6 +8,7 @@ import DetectFootnotes from './transformations/DetectFootnotes.jsx'
 import DetectTOC from './transformations/DetectTOC.jsx'
 import DetectLists from './transformations/DetectLists.jsx'
 import DetectCodeBlocks from './transformations/DetectCodeBlocks.jsx'
+import DetectHeadlines from './transformations/DetectHeadlines.jsx'
 // import DetectFormats from './transformations/DetectFormats.jsx'
 // import CombineSameY from './transformations/CombineSameY.jsx';
 // import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
@@ -35,6 +36,7 @@ export default class AppState {
             new DetectTOC(),
             new DetectLists(),
             new DetectCodeBlocks(),
+            new DetectHeadlines(),
 
             // new DetectFormats(),
             // new CombineSameY(),
src/javascript/models/HeadlineFinder.jsx (new file, 40 lines):

@@ -0,0 +1,40 @@
+import { normalizedCharCodeArray } from '../functions.jsx'
+
+export default class HeadlineFinder {
+
+    constructor(options) {
+        this.headlineCharCodes = normalizedCharCodeArray(options.headline);
+        this.stackedTextItems = [];
+        this.stackedChars = 0;
+    }
+
+    consume(textItem) {
+        const normalizedCharCodes = normalizedCharCodeArray(textItem.text);
+        const matchAll = this.matchAll(normalizedCharCodes);
+        if (matchAll) {
+            this.stackedTextItems.push(textItem);
+            this.stackedChars += normalizedCharCodes.length;
+            if (this.stackedChars == this.headlineCharCodes.length) {
+                return this.stackedTextItems;
+            }
+        } else {
+            if (this.stackedChars > 0) {
+                this.stackedChars = 0;
+                this.stackedTextItems = [];
+                this.consume(textItem); // test again without stack
+            }
+        }
+        return null;
+    }
+
+    matchAll(normalizedCharCodes) {
+        for (var i = 0; i < normalizedCharCodes.length; i++) {
+            const headlineChar = this.headlineCharCodes[this.stackedChars + i];
+            const textItemChar = normalizedCharCodes[i];
+            if (textItemChar != headlineChar) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+}
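A quick sketch of how the new HeadlineFinder is driven (illustrative only; the headline and text items are made up, import paths follow the spec added below). consume() stacks text items whose normalized characters continue the expected headline and returns the full stack once the whole headline has been matched, otherwise null; a non-matching item clears the stack and is retried on its own.

import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
import TextItem from '../src/javascript/models/TextItem.jsx';

// Hypothetical case: a headline split across two PDF text items.
const finder = new HeadlineFinder({ headline: 'My Little Headline' });
const part1 = new TextItem({ text: 'My Little' });
const part2 = new TextItem({ text: ' Headline' });

finder.consume(part1); // null - only 'MYLITTLE' matched so far
finder.consume(part2); // [part1, part2] - headline complete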
src/javascript/models/transformations/DetectLists.jsx:

@@ -35,9 +35,9 @@ export default class DetectLists extends ToPdfBlockViewTransformation {
 
         var lastItemX;
         var currentLevel = 0;
+        var xByLevel = {};
         var itemsBeforeFirstLineItem = [];
         var listBlockItems = [];
-        var xByLevel = {};
 
         const pushLineItem = (originalItem, text, setLevel) => {
             if (lastItemX && setLevel) {
src/javascript/models/transformations/DetectTOC.jsx:

@@ -3,8 +3,9 @@ import ParseResult from '../ParseResult.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfBlock from '../PdfBlock.jsx';
 import TextItemCombiner from '../TextItemCombiner.jsx';
+import HeadlineFinder from '../HeadlineFinder.jsx';
 import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../Annotation.jsx';
-import { TOC_BLOCK, HEADLINE2 } from '../MarkdownElements.jsx';
+import { TOC_BLOCK, HEADLINE2, headlineByLevel } from '../MarkdownElements.jsx';
 import { isDigit } from '../../functions.jsx'
 
 //Detect table of contents pages
@@ -22,14 +23,16 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
             mostUsedDistance: mostUsedDistance
         });
 
-        var lastLevel = 0;
-        const itemLeveler = new ItemLeveler();
+        const linkLeveler = new LinkLeveler();
+        var tocLinks = [];
+        var lastTocPage;
         parseResult.content.slice(0, maxPagesToEvaluate).forEach(page => {
             var linesCount = 0;
             var linesWithDigitsCount = 0;
             var lineItemsWithDigits = [];
             const unknownBlocks = new Set();
             var headlineBlock;
+            const pageTocLinks = [];
             page.blocks.forEach(block => {
                 var blockHasLinesWithDigits = false;
                 const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
@@ -38,8 +41,10 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                     linesCount++
                     var lineText = lineItem.text.replace(/\./g, '').trim();
                     var endsWithDigit = false;
+                    var digits = [];
                     while (isDigit(lineText.charCodeAt(lineText.length - 1))) {
-                        lineText = lineText.substring(0, lineText.length - 2);
+                        digits.unshift(lineText.charAt(lineText.length - 1));
+                        lineText = lineText.substring(0, lineText.length - 1);
                         endsWithDigit = true;
                     }
                     lineText = lineText.trim();
@@ -50,6 +55,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                        }
                        linesWithDigitsCount++;
                        blockHasLinesWithDigits = true;
+                       pageTocLinks.push(new TocLink({
+                           pageNumber: parseInt(digits.join('')),
+                           textItem: new TextItem({
+                               ...lineItem,
+                               text: lineText
+                           })
+                       }));
                        lineItemsWithDigits.push(new TextItem({
                            ...lineItem,
                            text: lineText
@@ -67,8 +79,13 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                 }
             });
 
+            // page has been processed
             if (linesWithDigitsCount * 100 / linesCount > 75) {
                 tocPages.push(page.index + 1);
+                lastTocPage = page;
+                linkLeveler.levelPageItems(pageTocLinks);
+                tocLinks = tocLinks.concat(pageTocLinks);
+
                 const newBlocks = [];
                 page.blocks.forEach((block) => {
                     if (!unknownBlocks.has(block)) {
@@ -83,17 +100,50 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                         }));
                     }
                 });
-                // lastLevel = processLevels(lineItemsWithDigits, lastLevel);
-                itemLeveler.level(lineItemsWithDigits);
-                newBlocks.push(new PdfBlock({
-                    textItems: lineItemsWithDigits,
-                    type: TOC_BLOCK,
-                    annotation: ADDED_ANNOTATION
-                }));
                 page.blocks = newBlocks;
             }
         });
 
+        //all pages have been processed
+        var foundHeadlines = tocLinks.length;
+        const notFoundHeadlines = [];
+        if (tocPages.length > 0) {
+            tocLinks.forEach(tocLink => {
+                var linkedPage = parseResult.content[tocLink.pageNumber - 1];
+                var foundHeadline = false;
+                if (linkedPage) {
+                    foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                    if (!foundHeadline) { // pages are off by 1 ?
+                        linkedPage = parseResult.content[tocLink.pageNumber];
+                        if (linkedPage) {
+                            foundHeadline = findHeadline(linkedPage, tocLink, textCombiner);
+                        }
+                    }
+                } else {
+                    //TODO sometimes pages are off. We could try the page range from pre to next ...
+                }
+                if (!foundHeadline) {
+                    notFoundHeadlines.push(tocLink);
+                }
+            });
+            lastTocPage.blocks.push(new PdfBlock({
+                textItems: tocLinks.map(tocLink => {
+                    tocLink.textItem.text = ' '.repeat(tocLink.level * 3) + '- ' + tocLink.textItem.text;
+                    return tocLink.textItem
+                }),
+                type: TOC_BLOCK,
+                annotation: ADDED_ANNOTATION
+            }));
+        }
+
+        const messages = [];
+        messages.push('Detected ' + tocPages.length + ' table of content pages');
+        if (foundHeadlines > 0) {
+            messages.push('Found TOC headlines: ' + (foundHeadlines - notFoundHeadlines.length) + '/' + foundHeadlines);
+        }
+        if (notFoundHeadlines.length > 0) {
+            messages.push('Missing TOC headlines: ' + notFoundHeadlines.map(tocLink => tocLink.textItem.text + '=>' + tocLink.pageNumber));
+        }
         return new ParseResult({
             ...parseResult,
             globals: {
@@ -101,27 +151,61 @@ export default class DetectTOC extends ToPdfBlockViewTransformation {
                 tocPages: tocPages
 
             },
-            messages: ['Detected ' + tocPages.length + ' table of content pages']
+            messages: messages
         });
     }
 
 }
 
+function findHeadline(page, tocLink, textCombiner) {
+    const headline = tocLink.textItem.text;
+    const headlineFinder = new HeadlineFinder({
+        headline: headline
+    });
+    var blockIndex = 0;
+    var lastBlock;
+    for ( var block of page.blocks ) {
+        const itemsGroupedByY = textCombiner.combine(block.textItems).textItems;
+        for ( var item of itemsGroupedByY ) {
+            const headlineItems = headlineFinder.consume(item);
+            if (headlineItems) {
+                const usedItems = headlineFinder.stackedTextItems;
+                block.annotation = REMOVED_ANNOTATION;
+                if (usedItems.length > itemsGroupedByY.length) {
+                    // 2 line headline
+                    lastBlock.annotation = REMOVED_ANNOTATION;
+                }
+                page.blocks.splice(blockIndex + 1, 0, new PdfBlock({
+                    textItems: [new TextItem({
+                        ...usedItems[0],
+                        text: headline
+                    })],
+                    type: headlineByLevel(tocLink.level + 2),
+                    annotation: ADDED_ANNOTATION
+                }));
+                return true;
+            }
+        }
+        blockIndex++;
+        lastBlock = block;
+    }
+    return false;
+}
+
-class ItemLeveler {
+class LinkLeveler {
     constructor() {
         this.levelByMethod = null;
         this.uniqueFonts = [];
-        this.headlines = [];
     }
 
-    level(lineItemsWithDigits) {
+    levelPageItems(tocLinks:TocLink[]) {
         if (!this.levelByMethod) {
-            const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
+            const uniqueX = this.calculateUniqueX(tocLinks);
             if (uniqueX.length > 1) {
                 this.levelByMethod = this.levelByXDiff;
             } else {
-                const uniqueFonts = this.calculateUniqueFonts(lineItemsWithDigits);
+                const uniqueFonts = this.calculateUniqueFonts(tocLinks);
                 if (uniqueFonts.length > 1) {
                     this.uniqueFonts = uniqueFonts;
                     this.levelByMethod = this.levelByFont;
@@ -130,46 +214,31 @@ class ItemLeveler {
                 }
             }
         }
-        this.levelByMethod(lineItemsWithDigits);
+        this.levelByMethod(tocLinks);
     }
 
-    levelByXDiff(lineItemsWithDigits) {
-        const uniqueX = this.calculateUniqueX(lineItemsWithDigits);
-        lineItemsWithDigits.forEach(item => {
-            const level = uniqueX.indexOf(item.x);
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelByXDiff(tocLinks) {
+        const uniqueX = this.calculateUniqueX(tocLinks);
+        tocLinks.forEach(link => {
+            link.level = uniqueX.indexOf(link.textItem.x);
         });
     }
 
-    levelByFont(lineItemsWithDigits) {
-        lineItemsWithDigits.forEach(item => {
-            const level = this.uniqueFonts.indexOf(item.font);
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelByFont(tocLinks) {
+        tocLinks.forEach(link => {
+            link.level = this.uniqueFonts.indexOf(link.textItem.font);
        });
    }
 
-    levelToZero(lineItemsWithDigits) {
-        lineItemsWithDigits.forEach(item => {
-            const level = 0;
-            this.headlines.push(new Headline({
-                level: level,
-                text: item.text
-            }));
-            item.text = ' '.repeat(level * 3) + '- ' + item.text;
+    levelToZero(tocLinks) {
+        tocLinks.forEach(link => {
+            link.level = 0;
        });
    }
 
-    calculateUniqueX(lineItemsWithDigits) {
-        var uniqueX = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
-            if (uniquesArray.indexOf(lineItem.x) < 0) uniquesArray.push(lineItem.x);
+    calculateUniqueX(tocLinks) {
+        var uniqueX = tocLinks.reduce(function(uniquesArray, link) {
+            if (uniquesArray.indexOf(link.textItem.x) < 0) uniquesArray.push(link.textItem.x);
             return uniquesArray;
         }, []);
 
@@ -180,9 +249,9 @@ class ItemLeveler {
         return uniqueX;
     }
 
-    calculateUniqueFonts(lineItemsWithDigits) {
-        var uniqueFont = lineItemsWithDigits.reduce(function(uniquesArray, lineItem) {
-            if (uniquesArray.indexOf(lineItem.font) < 0) uniquesArray.push(lineItem.font);
+    calculateUniqueFonts(tocLinks) {
+        var uniqueFont = tocLinks.reduce(function(uniquesArray, link) {
+            if (uniquesArray.indexOf(link.textItem.font) < 0) uniquesArray.push(link.textItem.font);
             return uniquesArray;
         }, []);
 
@@ -191,9 +260,10 @@ class ItemLeveler {
 
 }
 
-class Headline {
+class TocLink {
     constructor(options) {
-        this.level = options.level;
-        this.text = options.text;
+        this.textItem = options.textItem;
+        this.pageNumber = options.pageNumber;
+        this.level = 0;
     }
 }
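Taken together, the reworked DetectTOC now (1) strips the trailing page digits off each TOC line and records them as TocLink objects, (2) levels the links per page with LinkLeveler, by distinct x offsets when the TOC is indented, otherwise by distinct fonts, otherwise level 0, and (3) uses HeadlineFinder via findHeadline() to locate each linked headline on its target page (retrying the next page for off-by-one page numbers), inserting a headlineByLevel(level + 2) block where it matches. A rough, simplified sketch of the x-based leveling on made-up data (object shapes follow the diff, the numbers are hypothetical):

// Hypothetical TOC links; textItem.x is the line's horizontal offset on the TOC page.
const tocLinks = [
    { textItem: { x: 20, text: 'Introduction' }, pageNumber: 1, level: 0 },
    { textItem: { x: 40, text: 'Background' },   pageNumber: 2, level: 0 },
    { textItem: { x: 20, text: 'Results' },      pageNumber: 5, level: 0 },
];

// levelByXDiff, roughly: a link's level is the position of its x among the distinct x values.
const uniqueX = tocLinks
    .map(link => link.textItem.x)
    .filter((x, i, all) => all.indexOf(x) === i);                 // [20, 40]
tocLinks.forEach(link => { link.level = uniqueX.indexOf(link.textItem.x); });

// The TOC block then renders each entry as ' '.repeat(link.level * 3) + '- ' + text:
// - Introduction   (level 0)
//    - Background  (level 1)
// - Results        (level 0)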
test/HeadlineFinder.spec.js (new file, 134 lines):

@@ -0,0 +1,134 @@
+import { expect } from 'chai';
+
+import HeadlineFinder from '../src/javascript/models/HeadlineFinder';
+import TextItem from '../src/javascript/models/TextItem.jsx';
+
+describe('HeadlineFinder', () => {
+
+
+    it('Not Found - Case 1', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline2'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
+
+    });
+
+    it('Found - Simple', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Waste in beginning', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item0 = new TextItem({
+            text: 'Waste '
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item0)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(0);
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Duplicate in beginning', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'My Little Headline'
+        });
+        const item0 = new TextItem({
+            text: 'My '
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item0)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item0);
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+    it('Found - Mixed up case and Whitespace', () => {
+        const headlineFinder = new HeadlineFinder({
+            headline: 'MYLitt le HEADline'
+        });
+        const item1 = new TextItem({
+            text: 'My '
+        });
+        const item2 = new TextItem({
+            text: 'Little'
+        });
+        const item3 = new TextItem({
+            text: ' Headline'
+        });
+
+        expect(headlineFinder.consume(item1)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(1).to.contain(item1);
+        expect(headlineFinder.consume(item2)).to.equal(null);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(2).to.contain(item1).to.contain(item2);
+        expect(headlineFinder.consume(item3)).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+        expect(headlineFinder.stackedTextItems).to.have.lengthOf(3).to.contain(item1).to.contain(item2).to.contain(item3);
+
+    });
+
+});
Spec for functions.jsx (under test/):

@@ -1,6 +1,6 @@
 import { expect } from 'chai';
 
-import { hasUpperCaseCharacterInMiddleOfWord } from '../src/javascript/functions.jsx'
+import { hasUpperCaseCharacterInMiddleOfWord, normalizedCharCodeArray, charCodeArray } from '../src/javascript/functions.jsx'
 
 describe('hasUpperCaseCharacterInMiddleOfWord', () => {
 
@@ -37,3 +37,42 @@ describe('hasUpperCaseCharacterInMiddleOfWord', () => {
         expect(hasUpperCaseCharacterInMiddleOfWord("High 5'Sec")).to.equal(true);
     });
 });
+
+describe('charCodeArray', () => {
+    it('Charcodes', () => {
+        expect(charCodeArray(".")).to.have.lengthOf(1).to.contain(46);
+    });
+
+    it('Convert Back', () => {
+        expect(String.fromCharCode.apply(null, charCodeArray("word"))).to.equal("word");
+        expect(String.fromCharCode.apply(null, charCodeArray("WORD"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, charCodeArray("a word"))).to.equal("a word");
+    });
+
+});
+
+describe('normalizedCharCodeArray', () => {
+
+    it('No Change', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WORD23"))).to.equal("WORD23");
+    });
+
+    it('lowecaseToUpperCase', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("word"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WoRd"))).to.equal("WORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("word23"))).to.equal("WORD23");
+    });
+
+    it('RemoveWhiteSpace', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("A WORD"))).to.equal("AWORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("SOME LITTLE SENTENCE."))).to.equal("SOMELITTLESENTENCE");
+    });
+
+    it('All', () => {
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("a word"))).to.equal("AWORD");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("WoRd 4 u"))).to.equal("WORD4U");
+        expect(String.fromCharCode.apply(null, normalizedCharCodeArray("Some little sentence."))).to.equal("SOMELITTLESENTENCE");
+    });
+
+});