Detect Links & Remove Whitespaces

2025-06-25 03:51:33 +02:00 · 2017-02-11 15:23:01 +01:00 · 2017-02-11 15:23:01 +01:00 · 996e5fae62
commit 996e5fae62
parent fc0aafebdd
3 changed files with 136 additions and 0 deletions
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -3,7 +3,9 @@ import { Enum } from 'enumify';
 import NoOp from './transformations/NoOp.jsx';
 import RoundCoordinates from './transformations/RoundCoordinates.jsx';
 import CombineSameY from './transformations/CombineSameY.jsx';
 import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
 import DetectFootnotes from './transformations/DetectFootnotes.jsx'
 import DetectLinks from './transformations/DetectLinks.jsx'
 import RemoveRepetitiveElements from './transformations/RemoveRepetitiveElements.jsx'
 import HeadlineDetector from './transformations/HeadlineDetector.jsx'
 import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
@ -22,7 +24,9 @@ export default class AppState {
            new NoOp,
            new RoundCoordinates(),
            new CombineSameY(),
            new RemoveWhitespaces(),
            new DetectFootnotes(),
            new DetectLinks(),
            new RemoveRepetitiveElements(),
            new HeadlineDetector(),
            new HeadlineToUppercase(),
--- a/src/javascript/models/transformations/DetectLinks.jsx
+++ b/src/javascript/models/transformations/DetectLinks.jsx
@ -0,0 +1,68 @@
 import Transformation from './Transformation.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfPage from '../PdfPage.jsx';
 import ContentView from '../ContentView.jsx';
 import Annotation from '../Annotation.jsx';
 export default class DetectLinks extends Transformation {
    constructor() {
        super("Detect Links");
    }
    contentView() {
        return ContentView.PDF;
    }
    transform(pages:PdfPage[]) {
        const addedAnnotation = new Annotation({
            category: 'added',
            color: 'green'
        });
        const removedAnnotation = new Annotation({
            category: 'removed',
            color: 'red'
        });
        pages.forEach(page => {
            const newTextItems = [];
            page.textItems.forEach(item => {
                newTextItems.push(item);
                var words = item.text.split(' ');
                var changedWords = [];
                var change = false;
                words.forEach(word => {
                    if (word.startsWith('http:')) {
                        changedWords.push(`[${word}](${word})`);
                        change = true;
                    } else if (word.startsWith('www.')) {
                        changedWords.push(`[http://${word}](http://${word})`);
                        change = true;
                    } else {
                        changedWords.push(word);
                    }
                });
                if (change) {
                    newTextItems.push(new TextItem({
                        ...item,
                        text: changedWords.join(' '),
                        annotation: addedAnnotation,
                    }));
                    item.annotation = removedAnnotation;
                }
            });
            page.textItems = newTextItems;
        });
        return pages;
    }
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return pages;
    }
 }
--- a/src/javascript/models/transformations/RemoveWhitespaces.jsx
+++ b/src/javascript/models/transformations/RemoveWhitespaces.jsx
@ -0,0 +1,64 @@
 import Transformation from './Transformation.jsx';
 import TextItem from '../TextItem.jsx';
 import PdfPage from '../PdfPage.jsx';
 import ContentView from '../ContentView.jsx';
 import Annotation from '../Annotation.jsx';
 export default class RemoveWhitespaces extends Transformation {
    constructor() {
        super("Remove Whitespaces");
    }
    contentView() {
        return ContentView.PDF;
    }
    transform(pages:PdfPage[]) {
        const addedAnnotation = new Annotation({
            category: 'added',
            color: 'green'
        });
        const removedAnnotation = new Annotation({
            category: 'removed',
            color: 'red'
        });
        pages.forEach(page => {
            const newTextItems = [];
            page.textItems.forEach(item => {
                newTextItems.push(item);
                var words = item.text.trim().split(' ');
                var changedWords = [];
                var change = false;
                words.forEach(word => {
                    if (word.length == 0) {
                        change = true;
                    } else {
                        changedWords.push(word);
                    }
                });
                if (change) {
                    newTextItems.push(new TextItem({
                        ...item,
                        text: changedWords.join(' '),
                        annotation: addedAnnotation,
                    }));
                    item.annotation = removedAnnotation;
                }
            });
            page.textItems = newTextItems;
        });
        return pages;
    }
    processAnnotations(pages:PdfPage[]) {
        pages.forEach(page => {
            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation.category !== 'removed');
            page.textItems.forEach(textItem => textItem.annotation = null)
        });
        return pages;
    }
 }