Add not perfect headline detection

2025-06-24 11:33:49 +02:00 · 2017-02-05 09:58:25 +01:00 · 2017-02-05 09:58:25 +01:00 · 0245ea16f1
commit 0245ea16f1
parent e90226c1d8
2 changed files with 158 additions and 0 deletions
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -5,6 +5,7 @@ import RoundCoordinates from './transformations/RoundCoordinates.jsx';
 import CombineSameY from './transformations/CombineSameY.jsx';
 import DetectFootnotes from './transformations/DetectFootnotes.jsx'
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
+import HeadlineDetector from './transformations/HeadlineDetector.jsx'
 import ToTextPages from './transformations/ToTextPages.jsx';
 import ToSingleTextPage from './transformations/ToSingleTextPage.jsx'

@ -22,6 +23,7 @@ export default class AppState {
            new CombineSameY(),
            new DetectFootnotes(),
            new RemoveRepetitiveElements(),
+            new HeadlineDetector(),
            new ToTextPages(),
            new ToSingleTextPage()];

--- a/src/javascript/models/transformations/HeadlineDetector.jsx
+++ b/src/javascript/models/transformations/HeadlineDetector.jsx
@ -0,0 +1,156 @@
+import Transformation from './Transformation.jsx';
+import TextItem from '../TextItem.jsx';
+import PdfPage from '../PdfPage.jsx';
+import ContentView from '../ContentView.jsx';
+import Annotation from '../Annotation.jsx';
+
+
+function analyzeHeigths(pages) {
+    const analyzationResult = {
+        maxHeight: 0,
+        maxYPerPage: {},
+        heights: [],
+        mostUsedHeight: -1
+    };
+    const allHeights = new Set();
+    pages.forEach(page => {
+        var maxPageY = 0;
+        page.textItems.forEach(item => {
+            const height = item.height;
+            allHeights.add(height);
+            if (analyzationResult[height]) {
+                analyzationResult[height].repetition = analyzationResult[height].repetition + 1;
+                analyzationResult[height].pages.add(page.index);
+            } else {
+                analyzationResult[height] = {
+                    repetition: 1,
+                    pages: new Set([page.index])
+                };
+            }
+            maxPageY = Math.max(maxPageY, item.y);
+            analyzationResult.maxHeight = Math.max(analyzationResult.maxHeight, item.height);
+        });
+        analyzationResult.maxYPerPage[page.index] = maxPageY;
+    });
+
+    var maxRepetition = 0;
+    allHeights.forEach(height => {
+        const heightRepetition = analyzationResult[height].repetition;
+        analyzationResult.heights.push(height);
+        if (heightRepetition > maxRepetition) {
+            maxRepetition = heightRepetition;
+            analyzationResult.mostUsedHeight = height;
+        }
+    });
+    analyzationResult.heights = analyzationResult.heights.sort((a, b) => a - b);
+
+    return analyzationResult;
+}
+
+function findNextMajorHeight(heights, currentHeight, headlineMap) {
+    for (var i = currentHeight; i < heights.length; i++) {
+        if (headlineMap[heights[i]]) {
+            return heights[i];
+        }
+    }
+    throw `Shouldn't happen! heights=${heights}, currentHeight=${currentHeight}, headlineMap=${headlineMap}`;
+}
+
+
+export default class HeadlineDetector extends Transformation {
+
+    constructor() {
+        super("Detect Headlines");
+    }
+
+    contentView() {
+        return ContentView.PDF;
+    }
+
+    // Strategy:
+    // - find most used height => this & every height below is paragraph
+    // - heights which start a page are likely to be headlines
+    // - maxHeigth is likely a headline
+    // - heights which occur on more then one page are likely to be headlines
+    transform(pages:PdfPage[]) {
+        const heightAnalyzation = analyzeHeigths(pages);
+
+        var paragraphHeight = heightAnalyzation.mostUsedHeight + 1;
+
+        // text with more hight then the paragraph height which are on the top of the page are likely to be headlines
+        const likelyHeadingHeights = new Set();
+        pages.forEach(page => {
+            page.textItems.forEach(item => {
+                if (item.height > paragraphHeight && heightAnalyzation.maxYPerPage[page.index] == item.y) {
+                    likelyHeadingHeights.add(item.height);
+                }
+            });
+        });
+
+        const headlineHeights = [];
+        heightAnalyzation.heights.forEach(height => {
+            if (height == heightAnalyzation.maxHeight || (height > paragraphHeight && likelyHeadingHeights.has(height) && heightAnalyzation[height].pages.size > 1)) {
+                headlineHeights.push(height);
+            }
+        });
+
+
+        const headlineMap = {};
+        headlineHeights.reverse().forEach((height, i) => headlineMap[height] = '#'.repeat(i + 1));
+        var lastMajorHeight = paragraphHeight;
+        var heights = heightAnalyzation.heights;
+        for (var i = 0; i < heights.length; i++) {
+            if (heights[i] > paragraphHeight && !headlineMap[heights[i]]) {
+                const nextMajorHeight = findNextMajorHeight(heights, i + 1, headlineMap);
+                const distanceToLower = heights[i] - lastMajorHeight;
+                const distanceToHigher = nextMajorHeight - heights[i];
+                if (distanceToLower <= distanceToHigher) {
+                    if (lastMajorHeight == paragraphHeight) {
+                        paragraphHeight++;
+                    } else {
+                        headlineMap[heights[i]] = headlineMap[lastMajorHeight];
+                    }
+                } else {
+                    headlineMap[heights[i]] = headlineMap[nextMajorHeight];
+                }
+            }
+            if (headlineMap[heights[i]]) {
+                lastMajorHeight = heights[i];
+            }
+        }
+
+        return pages.map(page => {
+            const newTextItems = [];
+            page.textItems.forEach(item => {
+                if (item.height <= paragraphHeight) {
+                    newTextItems.push(item);
+                } else {
+                    newTextItems.push(new TextItem({
+                        ...item,
+                        text: item.text,
+                        annotation: new Annotation({
+                            category: headlineMap[item.height],
+                            color: 'green'
+                        })
+                    }));
+                }
+            });
+            return {
+                ...page,
+                textItems: newTextItems
+            };
+        });
+    }
+
+    processAnnotations(pages:PdfPage[]) {
+        pages.forEach(page => {
+            page.textItems.forEach(item => {
+                if (item.annotation) {
+                    item.text = item.annotation.category + ' ' + item.text;
+                }
+            });
+        });
+        return pages;
+    }
+
+}