Add imperfect headline detection

2025-07-13 04:25:11 +02:00 · 2017-02-05 09:58:25 +01:00
parent e90226c1d8
commit 0245ea16f1
2 changed files with 158 additions and 0 deletions
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -5,6 +5,7 @@ import RoundCoordinates from './transformations/RoundCoordinates.jsx';
 import CombineSameY from './transformations/CombineSameY.jsx';
 import DetectFootnotes from './transformations/DetectFootnotes.jsx'
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
 import HeadlineDetector from './transformations/HeadlineDetector.jsx'
 import ToTextPages from './transformations/ToTextPages.jsx';
 import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'
@ -22,6 +23,7 @@ export default class AppState {
            new CombineSameY(),
            new DetectFootnotes(),
            new RemoveRepetitiveElements(),
            new HeadlineDetector(),
            new ToTextPages(),
            new ToSingleTextPage()];
--- a/src/javascript/models/transformations/HeadlineDetector.jsx
+++ b/src/javascript/models/transformations/HeadlineDetector.jsx
@ -0,0 +1,156 @@
 import Transformation from './Transformation.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfPage from '../PdfPage.jsx';
 import ContentView from '../ContentView.jsx';
 import Annotation from '../Annotation.jsx';
 function analyzeHeigths(pages) {
    const analyzationResult = {
        maxHeight: 0,
        maxYPerPage: {},
        heights: [],
        mostUsedHeight: -1
    };
    const allHeights = new Set();
    pages.forEach(page => {
        var maxPageY = 0;
        page.textItems.forEach(item => {
            const height = item.height;
            allHeights.add(height);
            if (analyzationResult[height]) {
                analyzationResult[height].repetition = analyzationResult[height].repetition + 1;
                analyzationResult[height].pages.add(page.index);
            } else {
                analyzationResult[height] = {
                    repetition: 1,
                    pages: new Set([page.index])
                };
            }
            maxPageY = Math.max(maxPageY, item.y);
            analyzationResult.maxHeight = Math.max(analyzationResult.maxHeight, item.height);
        });
        analyzationResult.maxYPerPage[page.index] = maxPageY;
    });
    var maxRepetition = 0;
    allHeights.forEach(height => {
        const heightRepetition = analyzationResult[height].repetition;
        analyzationResult.heights.push(height);
        if (heightRepetition > maxRepetition) {
            maxRepetition = heightRepetition;
            analyzationResult.mostUsedHeight = height;
        }
    });
    analyzationResult.heights = analyzationResult.heights.sort((a, b) => a - b);
    return analyzationResult;
 }
 function findNextMajorHeight(heights, currentHeight, headlineMap) {
    for (var i = currentHeight; i < heights.length; i++) {
        if (headlineMap[heights[i]]) {
            return heights[i];
        }
    }
    throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineMap=${headlineMap}`;
 }
 export default class HeadlineDetector extends Transformation {
    constructor() {
        super("Detect Headlines");
    }
    contentView() {
        return ContentView.PDF;
    }
    // Strategy:
    // - find most used height => this & every height below is paragraph
    // - heights which start a page are likely to be headlines
    // - maxHeigth is likely a headline
    // - heights which occur on more then one page are likely to be headlines
    transform(pages:PdfPage[]) {
        const heightAnalyzation = analyzeHeigths(pages);
        var paragraphHeight = heightAnalyzation.mostUsedHeight + 1;
        // text with more hight then the paragraph height which are on the top of the page are likely to be headlines
        const likelyHeadingHeights = new Set();
        pages.forEach(page => {
            page.textItems.forEach(item => {
                if (item.height > paragraphHeight && heightAnalyzation.maxYPerPage[page.index] == item.y) {
                    likelyHeadingHeights.add(item.height);
                }
            });
        });
        const headlineHeights = [];
        heightAnalyzation.heights.forEach(height => {
            if (height == heightAnalyzation.maxHeight || (height > paragraphHeight && likelyHeadingHeights.has(height) && heightAnalyzation[height].pages.size > 1)) {
                headlineHeights.push(height);
            }
        });
        const headlineMap = {};
        headlineHeights.reverse().forEach((height, i) => headlineMap[height] = '#'.repeat(i + 1));
        var lastMajorHeight = paragraphHeight;
        var heights = heightAnalyzation.heights;
        for (var i = 0; i < heights.length; i++) {
            if (heights[i] > paragraphHeight && !headlineMap[heights[i]]) {
                const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineMap);
                const distanceToLower = heights[i] - lastMajorHeight;
                const distanceToHigher = nextMajorHeight - heights[i];
                if (distanceToLower <= distanceToHigher) {
                    if (lastMajorHeight == paragraphHeight) {
                        paragraphHeight++;
                    } else {
                        headlineMap[heights[i]] = headlineMap[lastMajorHeight];
                    }
                } else {
                    headlineMap[heights[i]] = headlineMap[nextMajorHeight];
                }
            }
            if (headlineMap[heights[i]]) {
                lastMajorHeight = heights[i];
            }
        }
        return pages.map(page => {
            const newTextItems = [];
            page.textItems.forEach(item => {
                if (item.height <= paragraphHeight) {
                    newTextItems.push(item);
                } else {
                    newTextItems.push(new TextItem({
                        ...item,
                        text: item.text,
                        annotation: new Annotation({
                            category: headlineMap[item.height],
                            color: 'green'
                        })
                    }));
                }
            });
            return {
                ...page,
                textItems: newTextItems
            };
        });
    }
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems.forEach(item => {
                if (item.annotation) {
                    item.text = item.annotation.category + ' ' + item.text;
                }
            });
        });
        return pages;
    }
 }