[WIP] move unused stuff in separate folder

2025-06-25 03:51:33 +02:00 · 2017-03-10 06:30:18 +01:00 · 2017-03-10 06:30:18 +01:00 · e2ddf0312b
commit e2ddf0312b
parent 111124fbf3
10 changed files with 205 additions and 0 deletions
--- a/src/javascript/models/HeaderLevelAssigner.jsx
+++ b/src/javascript/models/HeaderLevelAssigner.jsx
@ -0,0 +1,28 @@
 // Work in progress: assigns header levels to a stream of text heights that are potential headers.
 // The intended output is a header level for each height, or the judgement that the height is no header.
 // Levels range from 1 to 6, where 1 is the biggest headline.
 // HeaderLevelAssigner is used with a start level: if the start level is 2, the first headline gets
 // level 2 and level 1 is never handed out.
 export default class HeaderLevelAssigner {

     // options.startLevel: level assigned to the first height that is added.
     // options.paragraphHeight: stored as given; not read anywhere in this class yet.
     constructor(options) {
         this.startLevel = options.startLevel;
         this.paragraphHeight = options.paragraphHeight;
         // Level and height seen by the previous add() call; null until the first call.
         this.lastLevel = null;
         this.lastHeight = null;
         // Maps a height to the level that was assigned to it.
         this.heightToLevel = {};
     }

     // Registers the next potential header height.
     // The first height ever added is mapped to the start level. Later heights are
     // looked up in heightToLevel, but assigning a level to unknown heights is not
     // implemented yet.
     add(height) {
         // Explicit null/undefined check instead of a truthiness check: a previous
         // height of 0 must not make this call look like the first one again.
         if (this.lastHeight == null) {
             this.lastLevel = this.startLevel;
             this.heightToLevel[height] = this.startLevel;
         } else {
             const existingLevel = this.heightToLevel[height];
             if (!existingLevel) {
                 // TODO: decide the level for a height that has not been seen before.
             }
         }
         this.lastHeight = height;
     }
 }
--- a/src/javascript/models/transformations/old/CombineSameY.jsx
+++ b/src/javascript/models/transformations/old/CombineSameY.jsx
--- a/src/javascript/models/transformations/old/DetectFootnoteOld.jsx
+++ b/src/javascript/models/transformations/old/DetectFootnoteOld.jsx
@ -0,0 +1,70 @@
 import ToTextItemBlockTransformation from './ToTextItemBlockTransformation.jsx';
 import TextItem from '../TextItem.jsx';
 import ParseResult from '../ParseResult.jsx';
 import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
 import { isNumber } from '../../functions.jsx'
 export default class DetectFootnoteOld extends ToTextItemBlockTransformation {
    constructor() {
        super("Detect Footnote ");
    }
    transform(parseResult:ParseResult) {
        var nextFooterNumber = 1;
        var potentialFootnoteItem;
        var foundFootnotes = 0;
        const newContent = parseResult.content.map(page => {
            const newTextItems = [];
            for (var i = 0; i < page.textItems.length; i++) {
                const item = page.textItems[i];
                if (potentialFootnoteItem) {
                    if (potentialFootnoteItem.y - item.y < item.height) {
                        potentialFootnoteItem.annotation = REMOVED_ANNOTATION;
                        item.annotation = REMOVED_ANNOTATION;
                        newTextItems.push(potentialFootnoteItem);
                        newTextItems.push(item);
                        newTextItems.push(new TextItem({
                            x: potentialFootnoteItem.x,
                            y: item.y,
                            width: potentialFootnoteItem.width + item.width,
                            height: item.height,
                            text: '[' + potentialFootnoteItem.text + '] ' + item.text,
                            annotation: ADDED_ANNOTATION
                        }));
                        //TODO repsect multiline!!
                        nextFooterNumber++;
                        foundFootnotes++;
                    }
                    potentialFootnoteItem = null;
                } else if (isNumber(item.text) && parseInt(item.text) == nextFooterNumber && i > 0 && i < page.textItems.length - 1 && page.textItems[i - 1].y !== page.textItems[i + 1].y) {
                    potentialFootnoteItem = item;
                } else {
                    newTextItems.push(item);
                }
            }
            return {
                ...page,
                textItems: newTextItems
            };
        });
        return new ParseResult({
            ...parseResult,
            content: newContent,
            messages: ['Detected ' + foundFootnotes + ' footnotes']
        });
    }
    completeTransform(parseResult:ParseResult) {
        parseResult.content.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return parseResult;
    }
 }
--- a/src/javascript/models/transformations/old/DetectFormats.jsx
+++ b/src/javascript/models/transformations/old/DetectFormats.jsx
--- a/src/javascript/models/transformations/old/DetectLinks.jsx
+++ b/src/javascript/models/transformations/old/DetectLinks.jsx
--- a/src/javascript/models/transformations/old/HeadlineDetector.jsx
+++ b/src/javascript/models/transformations/old/HeadlineDetector.jsx
--- a/src/javascript/models/transformations/old/HeadlineDetector2.jsx
+++ b/src/javascript/models/transformations/old/HeadlineDetector2.jsx
@ -0,0 +1,107 @@
 import Transformation from './Transformation.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfPage from '../PdfPage.jsx';
 import ContentView from '../ContentView.jsx';
 import { Annotation, ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
 import Headline from '../markdown/Headline.jsx';
 // Returns the height with the highest occurrence count, parsed to an integer.
 // heightToOccurrence maps a height (object key) to how often it occurs.
 // On a tie the key that is enumerated first wins; an empty map yields 0.
 function getMostUsedHeight(heightToOccurrence) {
     let winningKey = '0';
     let winningCount = 0;
     for (const heightKey of Object.keys(heightToOccurrence)) {
         const count = heightToOccurrence[heightKey];
         // Strictly greater, so an earlier key keeps its lead on equal counts.
         if (count > winningCount) {
             winningCount = count;
             winningKey = heightKey;
         }
     }
     // Object keys are strings; parseInt also drops any fractional part of the height.
     return parseInt(winningKey);
 }
 export default class HeadlineDetector extends Transformation {
    constructor() {
        super("Detect Headlines");
    }
    contentView() {
        return ContentView.PDF;
    }
    // Strategy:
    // - find most used height => this & every height below is paragraph
    // - heights which start a page are likely to be headlines
    // - maxHeigth is likely a headline
    // - heights which occur on more then one page are likely to be headlines
    transform(pages:PdfPage[]) {
        const heightToOccurrence = {};
        pages.forEach(page => {
            page.textItems.forEach(item => {
                heightToOccurrence[item.height] = heightToOccurrence[item.height] ? heightToOccurrence[item.height] + 1 : 1;
            });
        });
        console.debug(heightToOccurrence);
        const mostUsedHeight = getMostUsedHeight(heightToOccurrence);
        console.debug("mostUsedHeight: " + mostUsedHeight);
        const headlineHeights = new Set(Object.keys(heightToOccurrence).filter(height => parseInt(height) > mostUsedHeight).map(elem => parseInt(elem)));
        console.debug(Array.from(headlineHeights));
        const headlineHeights2 = new Set();
        pages.forEach(page => {
            const textItems = page.textItems;
            for (var i = 0; i < textItems.length; i++) {
                const item = textItems[i];
                if (item.height > mostUsedHeight) {
                    item.annotation = ADDED_ANNOTATION;
                    const firstItemOnPage = i == 0;
                    var upperDistance = 99;
                    if (!firstItemOnPage) {
                        upperDistance = textItems[i - 1].y - item.y - item.height;
                    }
                    var lowerDistance = 0;
                    const lastItemOnPage = i == textItems.length - 1;
                    if (!lastItemOnPage) {
                        lowerDistance = item.y - textItems[i + 1].y - textItems[i + 1].height;
                    }
                    if (firstItemOnPage) {
                        console.debug("add " + item.height);
                        console.debug("potential headline: " + item.height + " | " + item.text);
                        console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
                        console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
                        headlineHeights2.add(item.height);
                    }
                    // if (!((firstItemOnPage || upperDistance > mostUsedHeight / 2) && lowerDistance > mostUsedHeight / 2)) {
                    //     console.debug("remove " + item.height);
                    //     console.debug("potential headline: " + item.height + " | " + item.text);
                    //     console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
                    //     console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
                    //     headlineHeights.delete(item.height);
                    // }
                // if ((firstItemOnPage || upperDistance > 10) && lowerDistance > 10) {
                //     item.annotation = ADDED_ANNOTATION;
                // }
                // console.debug("potential headline: " + item.height + " | " + item.text);
                // console.debug("\tfirstItem=" + firstItemOnPage + ", lastItem:" + lastItemOnPage);
                // console.debug("\tupperDistance/lowerDistance=" + upperDistance + " / " + lowerDistance);
                }
            }
        });
        console.debug(Array.from(headlineHeights2));
        return pages;
    }
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return pages;
    }
 }
--- a/src/javascript/models/transformations/old/HeadlineToUppercase.jsx
+++ b/src/javascript/models/transformations/old/HeadlineToUppercase.jsx
--- a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
+++ b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
--- a/src/javascript/models/transformations/old/ToBlockSystem.jsx
+++ b/src/javascript/models/transformations/old/ToBlockSystem.jsx