[WIP] Add remove whitespace and detect links again

2025-06-24 03:21:26 +02:00 · 2017-03-18 08:56:08 +01:00 · 2017-03-18 08:56:08 +01:00 · 07e7fbb505
commit 07e7fbb505
parent 4600dc6ee7
8 changed files with 95 additions and 131 deletions
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
                                                                </div>
                                                                <div style={ { textAlign: 'center', color: 'orange' } }>
                                                                  { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
-                                                                  { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
+                                                                  { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
                                                                </div>
                                                              </td>
                                                              <td>
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
 import CompactLines from './transformations/textitem/CompactLines.jsx';
 import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
 import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
+import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
 import DetectTOC from './transformations/textitem/DetectTOC.jsx'
 import DetectListItems from './transformations/textitem/DetectListItems.jsx'
 import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
 import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
 import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
 import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
-// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
 // import DetectFormats from './transformations/DetectFormats.jsx'
-// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
-// import DetectLinks from './transformations/DetectLinks.jsx'
-// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
 // import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
 import ToTextBlocks from './transformations/ToTextBlocks.jsx';
 import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -33,6 +30,7 @@ export default class AppState {
            new CompactLines(),
            new RemoveRepetitiveElements(),
            new VerticalToHorizontal(),
+            new PostprocessLines(),
            new DetectTOC(),
            new DetectListItems(),
            new DetectHeaders(),
@ -40,12 +38,8 @@ export default class AppState {
            new GatherBlocks(),
            new DetectCodeQuoteBlocks(),
            new DetectListLevels(),
-            // new DetectHeadlines(),

            // new DetectFormats(),
-            // new RemoveWhitespaces(),
-            // new DetectLinks(),
-            // new HeadlineDetector(),
            // new HeadlineToUppercase(),
            new ToTextBlocks(),
            new ToMarkdown()];
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@ -15,13 +15,15 @@ export default class PageItem {
 export class ParsedElements {

    constructor(options) {
-        this.footnoteLinks = options.footnoteLinks;
-        this.footnotes = options.footnotes;
+        this.footnoteLinks = options.footnoteLinks || [];
+        this.footnotes = options.footnotes || [];
+        this.containLinks = options.containLinks;
    }

    add(parsedElements) {
        this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
        this.footnotes = this.footnotes.concat(parsedElements.footnotes);
+        this.containLinks = this.containLinks || parsedElements.containLinks;
    }

 }
--- a/src/javascript/models/TextItemLineCompactor.jsx
+++ b/src/javascript/models/TextItemLineCompactor.jsx
@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
            });
        }
        combinedItem.parsedElements = parsedElements;
-
-        //TODO whitespace removal
-        //TODO bold/emphasis
-
        return combinedItem;
    }

@ -80,7 +76,7 @@ export default class TextItemLineCompactor {
                    //TODO womb comp [29] => ydiff == 0
                    newLineItems.push(new TextItem({
                        ...stashedNumberItems[0],
-                        text: `(^${ joinedNumber}):`
+                        text: `(^${ joinedNumber}): `
                    }));
                    footnotes.push(joinedNumber);
                } else {
--- a/src/javascript/models/transformations/old/DetectLinks.jsx
+++ b/src/javascript/models/transformations/old/DetectLinks.jsx
@ -1,54 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-export default class DetectLinks extends ToPdfViewTransformation {
-
-    constructor() {
-        super("Detect Links");
-    }
-
-    transform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            const newTextItems = [];
-            page.textItems.forEach(item => {
-                newTextItems.push(item);
-                var words = item.text.split(' ');
-                var changedWords = [];
-                var change = false;
-                words.forEach(word => {
-                    if (word.startsWith('http:')) {
-                        changedWords.push(`[${word}](${word})`);
-                        change = true;
-                    } else if (word.startsWith('www.')) {
-                        changedWords.push(`[http://${word}](http://${word})`);
-                        change = true;
-                    } else {
-                        changedWords.push(word);
-                    }
-                });
-                if (change) {
-                    newTextItems.push(new TextItem({
-                        ...item,
-                        text: changedWords.join(' '),
-                        annotation: ADDED_ANNOTATION,
-                    }));
-                    item.annotation = REMOVED_ANNOTATION;
-                }
-            });
-            page.textItems = newTextItems;
-        });
-        return parseResult;
-    }
-
-    completeTransform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
-            page.textItems.forEach(textItem => textItem.annotation = null)
-        });
-        return parseResult;
-    }
-
-}
--- a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
+++ b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
@ -1,51 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-export default class RemoveWhitespaces extends ToPdfViewTransformation {
-
-    constructor() {
-        super("Remove Whitespaces");
-        this.showWhitespaces = true;
-    }
-
-    transform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            const newTextItems = [];
-            page.textItems.forEach(item => {
-                newTextItems.push(item);
-                var words = item.text.trim().split(' ');
-                var changedWords = [];
-                var change = false;
-                words.forEach(word => {
-                    if (word.length == 0) {
-                        change = true;
-                    } else {
-                        changedWords.push(word);
-                    }
-                });
-                if (change) {
-                    newTextItems.push(new TextItem({
-                        ...item,
-                        text: changedWords.join(' '),
-                        annotation: ADDED_ANNOTATION,
-                    }));
-                    item.annotation = REMOVED_ANNOTATION;
-                }
-            });
-            page.textItems = newTextItems;
-        });
-        return parseResult;
-    }
-
-    completeTransform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
-            page.textItems.forEach(textItem => textItem.annotation = null)
-        });
-        return parseResult;
-    }
-
-}
--- a/src/javascript/models/transformations/textitem/CompactLines.jsx
+++ b/src/javascript/models/transformations/textitem/CompactLines.jsx
@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
 export default class CompactLines extends ToTextItemTransformation {

    constructor() {
-        super("Compact Lines");
+        super("Compact To Lines");
    }

    transform(parseResult:ParseResult) {
@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
                const newItems = [];
                const textItemsGroupedByLine = lineGrouper.group(page.items);
                textItemsGroupedByLine.forEach(textItemsOfLine => {
+                    var lineItem;
                    if (textItemsOfLine.length == 1) {
-                        newItems.push(textItemsOfLine[0]);
+                        lineItem = textItemsOfLine[0];
                    } else {
                        textItemsOfLine.forEach(item => {
                            item.annotation = REMOVED_ANNOTATION;
                            newItems.push(item);
                        });

-                        const combinedItem = lineCompactor.compact(textItemsOfLine);
-                        combinedItem.annotation = ADDED_ANNOTATION;
-                        newItems.push(combinedItem);
+                        lineItem = lineCompactor.compact(textItemsOfLine);
+                        lineItem.annotation = ADDED_ANNOTATION;

-                        if (combinedItem.parsedElements.footnoteLinks.length > 0) {
-                            const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
+                        if (lineItem.parsedElements.footnoteLinks.length > 0) {
+                            const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
                            foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
                        }
-                        if (combinedItem.parsedElements.footnotes.length > 0) {
-                            combinedItem.type = ElementType.FOOTNOTES;
-                            const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
+                        if (lineItem.parsedElements.footnotes.length > 0) {
+                            lineItem.type = ElementType.FOOTNOTES;
+                            const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
                            foundFootnotes.push.apply(foundFootnotes, footnotes);
                        }
                    }
+                    lineItem.text = lineItem.text.trim();
+                    newItems.push(lineItem);
                });
                page.items = newItems;
            }
--- a/src/javascript/models/transformations/textitem/PostprocessLines.jsx
+++ b/src/javascript/models/transformations/textitem/PostprocessLines.jsx
@ -0,0 +1,75 @@
+import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ParseResult from '../../ParseResult.jsx';
+import TextItem from '../../TextItem.jsx';
+import { ParsedElements } from '../../PageItem.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
+
+
+// Drops empty words left by repeated spaces and wraps words starting with 'http:' or 'www.' in Markdown link syntax.
+export default class PostprocessLines extends ToTextItemTransformation {
+
+    constructor() {
+        super("Remove Whitespace & Detect Links");
+        this.showWhitespaces = true;
+    }
+
+    transform(parseResult:ParseResult) {
+        var strippedWhitespace = 0;
+        var foundLinks = 0;
+
+        parseResult.pages.forEach(page => {
+            const newItems = [];
+            page.items.forEach(lineItem => {
+                newItems.push(lineItem);
+                var words = lineItem.text.split(' ');
+                var newWords = [];
+                var foundSuperflousNewLine = false;
+                var foundLink = false;
+                words.forEach(word => {
+                    if (word.trim().length == 0) {
+                        foundSuperflousNewLine = true;
+                        strippedWhitespace++;
+                    } else {
+                        if (word.startsWith('http:')) {
+                            foundLinks++;
+                            foundLink = true;
+                            newWords.push(`[${word}](${word})`);
+                        } else if (word.startsWith('www.')) {
+                            foundLinks++;
+                            foundLink = true;
+                            newWords.push(`[http://${word}](http://${word})`);
+                        } else {
+                            newWords.push(word);
+                        }
+                    }
+                });
+                if (foundSuperflousNewLine || foundLink) {
+                    lineItem.annotation = REMOVED_ANNOTATION;
+                    if (newWords.length > 0) {
+                        newItems.push(new TextItem({
+                            ...lineItem,
+                            text: newWords.join(' '),
+                            annotation: ADDED_ANNOTATION,
+                            parsedElements: new ParsedElements({
+                                ...lineItem.parsedElements,
+                                containLinks: foundLink
+                            })
+                        }));
+                    }
+                }
+            });
+            page.items = newItems;
+        });
+
+
+        return new ParseResult({
+            ...parseResult,
+            messages: [
+                'Stripped ' + strippedWhitespace + ' superflous whitespaces',
+                'Found ' + foundLinks + ' links',
+            ]
+        });
+    }
+
+
+}