[WIP] move unused stuff in separate folder

2025-06-24 19:41:24 +02:00 · 2017-03-10 06:30:18 +01:00 · 2017-03-10 06:30:18 +01:00 · e2ddf0312b
commit e2ddf0312b
parent 111124fbf3
10 changed files with 205 additions and 0 deletions
--- a/src/javascript/models/HeaderLevelAssigner.jsx
+++ b/src/javascript/models/HeaderLevelAssigner.jsx
@ -0,0 +1,28 @@
+
+// Input is a stream of heights which are potential headers; output is a header level for each height, or the judgement that it is not a header.
+// Levels range from 1 to 6, where 1 is the biggest headline.
+// HeaderLevelAssigner is used with a start level. If the start level is 2, then the first headline will be of level 2 and no level 1 will be assigned.
+export default class HeaderLevelAssigner {
+    // Reads `startLevel` and `paragraphHeight` from `options`; the other fields track what add() has seen so far.
+    constructor(options) {
+        this.startLevel = options.startLevel;
+        this.paragraphHeight = options.paragraphHeight; // stored, but not read anywhere in this class
+        this.lastLevel = null; // set to startLevel by add() when no previous height is recorded
+        this.lastHeight = null; // height passed to the previous add() call
+        this.heightToLevel = {}; // height -> level assigned to that height
+    }
+    // Records `height`: with no previous height it is mapped to startLevel; otherwise it is only looked up (work in progress).
+    add(height) {
+        if (!this.lastHeight) { // no previous height recorded (a previous height of 0 is also treated as "none")
+            this.lastLevel = this.startLevel;
+            this.heightToLevel[height] = this.startLevel;
+        } else {
+            const existingLevel = this.heightToLevel[height];
+            if (!existingLevel) {
+                // TODO: assigning a level to a height that has not been seen before is not implemented yet.
+            }
+        }
+        // Remember this height for the next call; add() returns nothing.
+        this.lastHeight = height;
+    }
+}
--- a/src/javascript/models/transformations/old/CombineSameY.jsx
+++ b/src/javascript/models/transformations/old/CombineSameY.jsx
--- a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx
+++ b/src/javascript/models/transformations/old/DetectFootnoteOld.jsx
@ -0,0 +1,70 @@
+import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
+import TextItem from '../TextItem.jsx';
+import ParseResult from '../ParseResult.jsx';
+import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
+
+import { isNumber } from '../../functions.jsx'
+
+export default class DetectFootnoteOld extends ToTextItemBlockTransformation {
+    // Merges a number-only text item with the text item that follows it into a single "[n] text" item.
+    constructor() {
+        super("Detect Footnote ");
+    }
+    // Returns a new ParseResult with rewritten page textItems and one message stating how many footnotes were found.
+    transform(parseResult:ParseResult) {
+        // The counters below are declared outside the page loop, so footnote numbering continues from page to page.
+        var nextFooterNumber = 1; // value the next candidate's text must equal
+        var potentialFootnoteItem; // number-only item held back until the item after it has been inspected
+        var foundFootnotes = 0;
+
+        const newContent = parseResult.content.map(page => {
+            const newTextItems = [];
+            for (var i = 0; i < page.textItems.length; i++) {
+                const item = page.textItems[i];
+                if (potentialFootnoteItem) { // the previous item was a candidate; the current item decides its fate
+                    if (potentialFootnoteItem.y - item.y < item.height) { // y difference is smaller than this item's height
+                        potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
+                        item.annotation = REMOVED_ANNOTATION;
+                        newTextItems.push(potentialFootnoteItem);
+                        newTextItems.push(item);
+                        newTextItems.push(new TextItem({ // merged replacement for the two originals pushed above
+                            x: potentialFootnoteItem.x,
+                            y: item.y,
+                            width: potentialFootnoteItem.width + item.width,
+                            height: item.height,
+                            text: '[' + potentialFootnoteItem.text + '] ' + item.text,
+                            annotation: ADDED_ANNOTATION
+                        }));
+                        //TODO respect multiline!!
+                        nextFooterNumber++;
+                        foundFootnotes++;
+                    }
+                    potentialFootnoteItem = null; // NOTE(review): if the check above failed, neither the candidate nor `item` was pushed, so both are missing from the output - confirm this is intended
+                } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
+                    potentialFootnoteItem = item; // held back: text equals the expected number, item is neither first nor last on the page, and its neighbours have different y
+                } else {
+                    newTextItems.push(item);
+                }
+            }
+            return {
+                ...page,
+                textItems: newTextItems
+            };
+        });
+        // `messages` is listed after the `...parseResult` spread, so it is set to this single-entry array.
+        return new ParseResult({
+            ...parseResult,
+            content: newContent,
+            messages: ['Detected ' + foundFootnotes + ' footnotes']
+        });
+    }
+    // Drops every text item annotated with REMOVED_ANNOTATION, clears the annotation on the rest, and returns the mutated parseResult.
+    completeTransform(parseResult:ParseResult) {
+        parseResult.content.forEach(page => {
+            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
+            page.textItems.forEach(textItem => textItem.annotation = null)
+        });
+        return parseResult;
+    }
+
+}
--- a/src/javascript/models/transformations/old/DetectFormats.jsx
+++ b/src/javascript/models/transformations/old/DetectFormats.jsx
--- a/src/javascript/models/transformations/old/DetectLinks.jsx
+++ b/src/javascript/models/transformations/old/DetectLinks.jsx
--- a/src/javascript/models/transformations/old/HeadlineDetector.jsx
+++ b/src/javascript/models/transformations/old/HeadlineDetector.jsx
--- a/src/javascript/models/transformations/old/HeadlineDetector2.jsx
+++ b/src/javascript/models/transformations/old/HeadlineDetector2.jsx
@ -0,0 +1,107 @@
+import Transformation from './Transformation.jsx';
+import TextItem from '../TextItem.jsx';
+import PdfPage from '../PdfPage.jsx';
+import ContentView from '../ContentView.jsx';
+import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
+
+import Headline from '../markdown/Headline.jsx';
+
+function getMostUsedHeight(heightToOccurrence) { // returns the key with the highest count, run through parseInt (0 when no key has a count above 0)
+    var maxOccurence = 0;
+    var maxHeight = 0;
+    Object.keys(heightToOccurrence).map((element) => { // map() is used for iteration only; its result is discarded
+        if (heightToOccurrence[element] > maxOccurence) { // strict '>' keeps the first key among equal counts
+            maxOccurence = heightToOccurrence[element];
+            maxHeight = element; // `element` is a string key
+        }
+    });
+    return parseInt(maxHeight); // NOTE(review): parseInt truncates a fractional key such as "11.5" to 11 - confirm heights are always integers
+}
+
+
+export default class HeadlineDetector extends Transformation {
+    // Annotates every text item taller than the most used height; the collected height sets are only logged.
+    constructor() {
+        super("Detect Headlines");
+    }
+
+    contentView() {
+        return ContentView.PDF;
+    }
+
+    // Strategy:
+    // - find most used height => this & every height below is paragraph
+    // - heights which start a page are likely to be headlines
+    // - maxHeight is likely a headline
+    // - heights which occur on more than one page are likely to be headlines
+    transform(pages:PdfPage[]) {
+        // Count how often each item height occurs across all pages.
+        const heightToOccurrence = {};
+        pages.forEach(page => {
+            page.textItems.forEach(item => {
+                heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
+            });
+        });
+        console.debug(heightToOccurrence);
+        const mostUsedHeight = getMostUsedHeight(heightToOccurrence);
+        console.debug("mostUsedHeight: " + mostUsedHeight);
+        // Heights whose parseInt value exceeds mostUsedHeight; this set is only logged, nothing below reads it.
+        const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem)));
+        console.debug(Array.from(headlineHeights));
+        const headlineHeights2 = new Set();
+        pages.forEach(page => {
+            const textItems = page.textItems;
+            for (var i = 0; i < textItems.length; i++) {
+                const item = textItems[i];
+                if (item.height > mostUsedHeight) { // NOTE(review): compares the raw height, while headlineHeights above compares parseInt(height) - confirm both are meant to agree
+                    // Mark the item, then compute the gaps to the previous and next item from their y and height.
+                    item.annotation = ADDED_ANNOTATION;
+                    const firstItemOnPage = i == 0;
+                    var upperDistance = 99; // placeholder kept when there is no previous item on the page
+                    if (!firstItemOnPage) {
+                        upperDistance = textItems[i - 1].y - item.y - item.height;
+                    }
+                    var lowerDistance = 0; // stays 0 when there is no next item on the page
+                    const lastItemOnPage = i == textItems.length - 1;
+                    if (!lastItemOnPage) {
+                        lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height;
+                    }
+                    if (firstItemOnPage) { // only page-starting items contribute to headlineHeights2
+                        console.debug("add " + item.height);
+                        console.debug("potential headline: " + item.height + " | " + item.text);
+                        console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
+                        console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
+                        headlineHeights2.add(item.height);
+                    }
+
+                    // if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) {
+                    //     console.debug("remove " + item.height);
+                    //     console.debug("potential headline: " + item.height + " | " + item.text);
+                    //     console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
+                    //     console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
+                    //     headlineHeights.delete(item.height);
+                    // }
+
+
+                // if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) {
+                //     item.annotation = ADDED_ANNOTATION;
+                // }
+                // console.debug("potential headline: " + item.height + " | " + item.text);
+                // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
+                // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
+                }
+            }
+        });
+        console.debug(Array.from(headlineHeights2)); // logged only; the method returns the same pages array with annotations set
+
+        return pages;
+    }
+    // Clears the annotation of every text item on every page and returns the same pages array.
+    processAnnotations(pages:PdfPage[]) {
+        pages.forEach(page => {
+            page.textItems.forEach(textItem => textItem.annotation = null)
+        });
+        return pages;
+    }
+
+}
--- a/src/javascript/models/transformations/old/HeadlineToUppercase.jsx
+++ b/src/javascript/models/transformations/old/HeadlineToUppercase.jsx
--- a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
+++ b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
--- a/src/javascript/models/transformations/old/ToBlockSystem.jsx
+++ b/src/javascript/models/transformations/old/ToBlockSystem.jsx