From 07e7fbb505c06d10881752d27e2a4270ca0cc175 Mon Sep 17 00:00:00 2001
From: Johannes Zillmann <jz@datameer.com>
Date: Sat, 18 Mar 2017 08:56:08 +0100
Subject: [PATCH] [WIP] Re-add whitespace removal and link detection

---
 .../components/debug/TextItemTable.jsx        |  2 +-
 src/javascript/models/AppState.jsx            | 10 +--
 src/javascript/models/PageItem.jsx            |  6 +-
 .../models/TextItemLineCompactor.jsx          |  6 +-
 .../transformations/old/DetectLinks.jsx       | 54 -------------
 .../transformations/old/RemoveWhitespaces.jsx | 51 -------------
 .../transformations/textitem/CompactLines.jsx | 22 +++---
 .../textitem/PostprocessLines.jsx             | 75 +++++++++++++++++++
 8 files changed, 95 insertions(+), 131 deletions(-)
 delete mode 100644 src/javascript/models/transformations/old/DetectLinks.jsx
 delete mode 100644 src/javascript/models/transformations/old/RemoveWhitespaces.jsx
 create mode 100644 src/javascript/models/transformations/textitem/PostprocessLines.jsx
diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx
index 91b0ac7..07c6712 100644
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
                                                                 </div>
                                                                 <div style={ { textAlign: 'center', color: 'orange' } }>
                                                                   { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
-                                                                  { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
+                                                                  { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
                                                                 </div>
                                                               </td>
                                                               <td>
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index dc83e4d..a18d8c5 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
 import CompactLines from './transformations/textitem/CompactLines.jsx';
 import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
 import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
+import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
 import DetectTOC from './transformations/textitem/DetectTOC.jsx'
 import DetectListItems from './transformations/textitem/DetectListItems.jsx'
 import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
@@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
 import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
 import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
 import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
-// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
 // import DetectFormats from './transformations/DetectFormats.jsx'
-// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
-// import DetectLinks from './transformations/DetectLinks.jsx'
-// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
 // import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
 import ToTextBlocks from './transformations/ToTextBlocks.jsx';
 import ToMarkdown from './transformations/ToMarkdown.jsx'
@@ -33,6 +30,7 @@ export default class AppState {
             new CompactLines(),
             new RemoveRepetitiveElements(),
             new VerticalToHorizontal(),
+            new PostprocessLines(),
             new DetectTOC(),
             new DetectListItems(),
             new DetectHeaders(),
@@ -40,12 +38,8 @@ export default class AppState {
             new GatherBlocks(),
             new DetectCodeQuoteBlocks(),
             new DetectListLevels(),
-            // new DetectHeadlines(),
 
             // new DetectFormats(),
-            // new RemoveWhitespaces(),
-            // new DetectLinks(),
-            // new HeadlineDetector(),
             // new HeadlineToUppercase(),
             new ToTextBlocks(),
             new ToMarkdown()];
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index 27c178e..080fb02 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -15,13 +15,15 @@ export default class PageItem {
 export class ParsedElements {
 
     constructor(options) {
-        this.footnoteLinks = options.footnoteLinks;
-        this.footnotes = options.footnotes;
+        this.footnoteLinks = options.footnoteLinks || [];
+        this.footnotes = options.footnotes || [];
+        this.containLinks = options.containLinks;
     }
 
     add(parsedElements) {
         this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
         this.footnotes = this.footnotes.concat(parsedElements.footnotes);
+        this.containLinks = this.containLinks || parsedElements.containLinks;
     }
 
 }
\ No newline at end of file
diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx
index 74c69b8..68b0bd7 100644
--- a/src/javascript/models/TextItemLineCompactor.jsx
+++ b/src/javascript/models/TextItemLineCompactor.jsx
@@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
             });
         }
         combinedItem.parsedElements = parsedElements;
-
-        //TODO whitespace removal
-        //TODO bold/emphasis
-
         return combinedItem;
     }
 
@@ -80,7 +76,7 @@ export default class TextItemLineCompactor {
                     //TODO womb comp [29] => ydiff == 0
                     newLineItems.push(new TextItem({
                         ...stashedNumberItems[0],
-                        text: `(^${ joinedNumber}):`
+                        text: `(^${ joinedNumber}): `
                     }));
                     footnotes.push(joinedNumber);
                 } else {
diff --git a/src/javascript/models/transformations/old/DetectLinks.jsx b/src/javascript/models/transformations/old/DetectLinks.jsx
deleted file mode 100644
index a05b5cf..0000000
--- a/src/javascript/models/transformations/old/DetectLinks.jsx
+++ /dev/null
@@ -1,54 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-export default class DetectLinks extends ToPdfViewTransformation {
-
-    constructor() {
-        super("Detect Links");
-    }
-
-    transform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            const newTextItems = [];
-            page.textItems.forEach(item => {
-                newTextItems.push(item);
-                var words = item.text.split(' ');
-                var changedWords = [];
-                var change = false;
-                words.forEach(word => {
-                    if (word.startsWith('http:')) {
-                        changedWords.push(`[${word}](${word})`);
-                        change = true;
-                    } else if (word.startsWith('www.')) {
-                        changedWords.push(`[http://${word}](http://${word})`);
-                        change = true;
-                    } else {
-                        changedWords.push(word);
-                    }
-                });
-                if (change) {
-                    newTextItems.push(new TextItem({
-                        ...item,
-                        text: changedWords.join(' '),
-                        annotation: ADDED_ANNOTATION,
-                    }));
-                    item.annotation = REMOVED_ANNOTATION;
-                }
-            });
-            page.textItems = newTextItems;
-        });
-        return parseResult;
-    }
-
-    completeTransform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
-            page.textItems.forEach(textItem => textItem.annotation = null)
-        });
-        return parseResult;
-    }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
deleted file mode 100644
index 8c91c3c..0000000
--- a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
+++ /dev/null
@@ -1,51 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-export default class RemoveWhitespaces extends ToPdfViewTransformation {
-
-    constructor() {
-        super("Remove Whitespaces");
-        this.showWhitespaces = true;
-    }
-
-    transform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            const newTextItems = [];
-            page.textItems.forEach(item => {
-                newTextItems.push(item);
-                var words = item.text.trim().split(' ');
-                var changedWords = [];
-                var change = false;
-                words.forEach(word => {
-                    if (word.length == 0) {
-                        change = true;
-                    } else {
-                        changedWords.push(word);
-                    }
-                });
-                if (change) {
-                    newTextItems.push(new TextItem({
-                        ...item,
-                        text: changedWords.join(' '),
-                        annotation: ADDED_ANNOTATION,
-                    }));
-                    item.annotation = REMOVED_ANNOTATION;
-                }
-            });
-            page.textItems = newTextItems;
-        });
-        return parseResult;
-    }
-
-    completeTransform(parseResult:ParseResult) {
-        parseResult.content.forEach(page => {
-            page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
-            page.textItems.forEach(textItem => textItem.annotation = null)
-        });
-        return parseResult;
-    }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx
index 3cc89ad..890e2ee 100644
--- a/src/javascript/models/transformations/textitem/CompactLines.jsx
+++ b/src/javascript/models/transformations/textitem/CompactLines.jsx
@@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
 export default class CompactLines extends ToTextItemTransformation {
 
     constructor() {
-        super("Compact Lines");
+        super("Compact To Lines");
     }
 
     transform(parseResult:ParseResult) {
@@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
                 const newItems = [];
                 const textItemsGroupedByLine = lineGrouper.group(page.items);
                 textItemsGroupedByLine.forEach(textItemsOfLine => {
+                    var lineItem;
                     if (textItemsOfLine.length == 1) {
-                        newItems.push(textItemsOfLine[0]);
+                        lineItem = textItemsOfLine[0];
                     } else {
                         textItemsOfLine.forEach(item => {
                             item.annotation = REMOVED_ANNOTATION;
                             newItems.push(item);
                         });
 
-                        const combinedItem = lineCompactor.compact(textItemsOfLine);
-                        combinedItem.annotation = ADDED_ANNOTATION;
-                        newItems.push(combinedItem);
+                        lineItem = lineCompactor.compact(textItemsOfLine);
+                        lineItem.annotation = ADDED_ANNOTATION;
 
-                        if (combinedItem.parsedElements.footnoteLinks.length > 0) {
-                            const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
+                        if (lineItem.parsedElements.footnoteLinks.length > 0) {
+                            const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
                             foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
                         }
-                        if (combinedItem.parsedElements.footnotes.length > 0) {
-                            combinedItem.type = ElementType.FOOTNOTES;
-                            const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
+                        if (lineItem.parsedElements.footnotes.length > 0) {
+                            lineItem.type = ElementType.FOOTNOTES;
+                            const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
                             foundFootnotes.push.apply(foundFootnotes, footnotes);
                         }
                     }
+                    lineItem.text = lineItem.text.trim();
+                    newItems.push(lineItem);
                 });
                 page.items = newItems;
             }
diff --git a/src/javascript/models/transformations/textitem/PostprocessLines.jsx b/src/javascript/models/transformations/textitem/PostprocessLines.jsx
new file mode 100644
index 0000000..4ec2c03
--- /dev/null
+++ b/src/javascript/models/transformations/textitem/PostprocessLines.jsx
@@ -0,0 +1,75 @@
+import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ParseResult from '../../ParseResult.jsx';
+import TextItem from '../../TextItem.jsx';
+import { ParsedElements } from '../../PageItem.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
+
+// Strips superfluous whitespace from each line item and rewrites words that
+// look like URLs (http:, https:, www.) as markdown links. A changed line is
+// marked REMOVED and followed by an ADDED copy carrying the new text.
+export default class PostprocessLines extends ToTextItemTransformation {
+
+    constructor() {
+        super("Remove Whitespace & Detect Links");
+        this.showWhitespaces = true;
+    }
+
+    // Mutates the given pages in place and returns a ParseResult with summary messages.
+    transform(parseResult:ParseResult) {
+        let strippedWhitespace = 0;
+        let foundLinks = 0;
+
+        parseResult.pages.forEach(page => {
+            const newItems = [];
+            page.items.forEach(lineItem => {
+                newItems.push(lineItem);
+                const newWords = [];
+                let foundSuperfluousWhitespace = false;
+                let foundLink = false;
+                lineItem.text.split(' ').forEach(word => {
+                    if (word.trim().length === 0) {
+                        // Empty or whitespace-only fragment left by split(' ').
+                        foundSuperfluousWhitespace = true;
+                        strippedWhitespace++;
+                    } else if (/^https?:/.test(word)) {
+                        foundLinks++;
+                        foundLink = true;
+                        newWords.push(`[${word}](${word})`);
+                    } else if (word.startsWith('www.')) {
+                        // Bare www. hosts get an http:// scheme prepended.
+                        foundLinks++;
+                        foundLink = true;
+                        newWords.push(`[http://${word}](http://${word})`);
+                    } else {
+                        newWords.push(word);
+                    }
+                });
+                if (foundSuperfluousWhitespace || foundLink) {
+                    lineItem.annotation = REMOVED_ANNOTATION;
+                    // A line with no words left gets no ADDED replacement.
+                    if (newWords.length > 0) {
+                        newItems.push(new TextItem({
+                            ...lineItem,
+                            text: newWords.join(' '),
+                            annotation: ADDED_ANNOTATION,
+                            parsedElements: new ParsedElements({
+                                ...lineItem.parsedElements,
+                                containLinks: foundLink
+                            })
+                        }));
+                    }
+                }
+            });
+            page.items = newItems;
+        });
+
+        return new ParseResult({
+            ...parseResult,
+            messages: [
+                'Stripped ' + strippedWhitespace + ' superflous whitespaces',
+                'Found ' + foundLinks + ' links',
+            ]
+        });
+    }
+
+}