diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx
index 91b0ac7..07c6712 100644
--- a/src/javascript/components/debug/TextItemTable.jsx
+++ b/src/javascript/components/debug/TextItemTable.jsx
@@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
- { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' }
+ { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx
index dc83e4d..a18d8c5 100644
--- a/src/javascript/models/AppState.jsx
+++ b/src/javascript/models/AppState.jsx
@@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
+import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
@@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
-// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx'
-// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
-// import DetectLinks from './transformations/DetectLinks.jsx'
-// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx'
@@ -33,6 +30,7 @@ export default class AppState {
new CompactLines(),
new RemoveRepetitiveElements(),
new VerticalToHorizontal(),
+ new PostprocessLines(),
new DetectTOC(),
new DetectListItems(),
new DetectHeaders(),
@@ -40,12 +38,8 @@ export default class AppState {
new GatherBlocks(),
new DetectCodeQuoteBlocks(),
new DetectListLevels(),
- // new DetectHeadlines(),
// new DetectFormats(),
- // new RemoveWhitespaces(),
- // new DetectLinks(),
- // new HeadlineDetector(),
// new HeadlineToUppercase(),
new ToTextBlocks(),
new ToMarkdown()];
diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx
index 27c178e..080fb02 100644
--- a/src/javascript/models/PageItem.jsx
+++ b/src/javascript/models/PageItem.jsx
@@ -15,13 +15,15 @@ export default class PageItem {
export class ParsedElements {
constructor(options) {
- this.footnoteLinks = options.footnoteLinks;
- this.footnotes = options.footnotes;
+ this.footnoteLinks = options.footnoteLinks || [];
+ this.footnotes = options.footnotes || [];
+ this.containLinks = options.containLinks;
}
add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes);
+ this.containLinks = this.containLinks || parsedElements.containLinks;
}
}
\ No newline at end of file
diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx
index 74c69b8..68b0bd7 100644
--- a/src/javascript/models/TextItemLineCompactor.jsx
+++ b/src/javascript/models/TextItemLineCompactor.jsx
@@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
});
}
combinedItem.parsedElements = parsedElements;
-
- //TODO whitespace removal
- //TODO bold/emphasis
-
return combinedItem;
}
@@ -80,7 +76,7 @@ export default class TextItemLineCompactor {
//TODO womb comp [29] => ydiff == 0
newLineItems.push(new TextItem({
...stashedNumberItems[0],
- text: `(^${ joinedNumber}):`
+ text: `(^${ joinedNumber}): `
}));
footnotes.push(joinedNumber);
} else {
diff --git a/src/javascript/models/transformations/old/DetectLinks.jsx b/src/javascript/models/transformations/old/DetectLinks.jsx
deleted file mode 100644
index a05b5cf..0000000
--- a/src/javascript/models/transformations/old/DetectLinks.jsx
+++ /dev/null
@@ -1,54 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-export default class DetectLinks extends ToPdfViewTransformation {
-
- constructor() {
- super("Detect Links");
- }
-
- transform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- const newTextItems = [];
- page.textItems.forEach(item => {
- newTextItems.push(item);
- var words = item.text.split(' ');
- var changedWords = [];
- var change = false;
- words.forEach(word => {
- if (word.startsWith('http:')) {
- changedWords.push(`[${word}](${word})`);
- change = true;
- } else if (word.startsWith('www.')) {
- changedWords.push(`[http://${word}](http://${word})`);
- change = true;
- } else {
- changedWords.push(word);
- }
- });
- if (change) {
- newTextItems.push(new TextItem({
- ...item,
- text: changedWords.join(' '),
- annotation: ADDED_ANNOTATION,
- }));
- item.annotation = REMOVED_ANNOTATION;
- }
- });
- page.textItems = newTextItems;
- });
- return parseResult;
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
deleted file mode 100644
index 8c91c3c..0000000
--- a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx
+++ /dev/null
@@ -1,51 +0,0 @@
-import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
-import TextItem from '../TextItem.jsx';
-import ParseResult from '../ParseResult.jsx';
-
-import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
-
-export default class RemoveWhitespaces extends ToPdfViewTransformation {
-
- constructor() {
- super("Remove Whitespaces");
- this.showWhitespaces = true;
- }
-
- transform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- const newTextItems = [];
- page.textItems.forEach(item => {
- newTextItems.push(item);
- var words = item.text.trim().split(' ');
- var changedWords = [];
- var change = false;
- words.forEach(word => {
- if (word.length == 0) {
- change = true;
- } else {
- changedWords.push(word);
- }
- });
- if (change) {
- newTextItems.push(new TextItem({
- ...item,
- text: changedWords.join(' '),
- annotation: ADDED_ANNOTATION,
- }));
- item.annotation = REMOVED_ANNOTATION;
- }
- });
- page.textItems = newTextItems;
- });
- return parseResult;
- }
-
- completeTransform(parseResult:ParseResult) {
- parseResult.content.forEach(page => {
- page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
- page.textItems.forEach(textItem => textItem.annotation = null)
- });
- return parseResult;
- }
-
-}
\ No newline at end of file
diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx
index 3cc89ad..890e2ee 100644
--- a/src/javascript/models/transformations/textitem/CompactLines.jsx
+++ b/src/javascript/models/transformations/textitem/CompactLines.jsx
@@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
export default class CompactLines extends ToTextItemTransformation {
constructor() {
- super("Compact Lines");
+ super("Compact To Lines");
}
transform(parseResult:ParseResult) {
@@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
const newItems = [];
const textItemsGroupedByLine = lineGrouper.group(page.items);
textItemsGroupedByLine.forEach(textItemsOfLine => {
+ var lineItem;
if (textItemsOfLine.length == 1) {
- newItems.push(textItemsOfLine[0]);
+ lineItem = textItemsOfLine[0];
} else {
textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION;
newItems.push(item);
});
- const combinedItem = lineCompactor.compact(textItemsOfLine);
- combinedItem.annotation = ADDED_ANNOTATION;
- newItems.push(combinedItem);
+ lineItem = lineCompactor.compact(textItemsOfLine);
+ lineItem.annotation = ADDED_ANNOTATION;
- if (combinedItem.parsedElements.footnoteLinks.length > 0) {
- const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },);
+ if (lineItem.parsedElements.footnoteLinks.length > 0) {
+ const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },);
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
}
- if (combinedItem.parsedElements.footnotes.length > 0) {
- combinedItem.type = ElementType.FOOTNOTES;
- const footnotes = combinedItem.parsedElements.footnotes.map(footnote => { footnote },);
+ if (lineItem.parsedElements.footnotes.length > 0) {
+ lineItem.type = ElementType.FOOTNOTES;
+ const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },);
foundFootnotes.push.apply(foundFootnotes, footnotes);
}
}
+ lineItem.text = lineItem.text.trim();
+ newItems.push(lineItem);
});
page.items = newItems;
}
diff --git a/src/javascript/models/transformations/textitem/PostprocessLines.jsx b/src/javascript/models/transformations/textitem/PostprocessLines.jsx
new file mode 100644
index 0000000..4ec2c03
--- /dev/null
+++ b/src/javascript/models/transformations/textitem/PostprocessLines.jsx
@@ -0,0 +1,75 @@
+import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
+import ParseResult from '../../ParseResult.jsx';
+import TextItem from '../../TextItem.jsx';
+import { ParsedElements } from '../../PageItem.jsx';
+import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
+
+
+/**
+ * Line post-processing stage: strips superfluous whitespace "words" from each
+ * line item and wraps URLs (http:, https:, www.) in markdown link syntax.
+ */
+export default class PostprocessLines extends ToTextItemTransformation {
+    constructor() {
+        super("Remove Whitespace & Detect Links");
+        this.showWhitespaces = true;
+    }
+
+    /**
+     * Rewrites every item of every page. A changed item is kept but annotated as
+     * removed, and a cleaned copy annotated as added is pushed right after it;
+     * an item left with no words at all gets no replacement copy. Returns a new
+     * ParseResult built from the spread input plus two summary messages.
+     */
+    transform(parseResult:ParseResult) {
+        var strippedWhitespace = 0;
+        var foundLinks = 0;
+        // Explicit schemes are linked as-is; bare "www." hosts get "http://" prepended.
+        const schemePrefixes = ['http:', 'https:'];
+        parseResult.pages.forEach(page => {
+            const newItems = [];
+            page.items.forEach(lineItem => {
+                newItems.push(lineItem);
+                const newWords = [];
+                var foundSuperfluousWhitespace = false;
+                var foundLink = false;
+                lineItem.text.split(' ').forEach(word => {
+                    if (word.trim().length === 0) {
+                        // Consecutive blanks produce empty "words"; dropping them collapses the gap.
+                        foundSuperfluousWhitespace = true;
+                        strippedWhitespace++;
+                    } else if (schemePrefixes.some(prefix => word.startsWith(prefix))) {
+                        foundLinks++;
+                        foundLink = true;
+                        newWords.push(`[${word}](${word})`);
+                    } else if (word.startsWith('www.')) {
+                        foundLinks++;
+                        foundLink = true;
+                        newWords.push(`[http://${word}](http://${word})`);
+                    } else {
+                        newWords.push(word);
+                    }
+                });
+                if (foundSuperfluousWhitespace || foundLink) {
+                    lineItem.annotation = REMOVED_ANNOTATION;
+                    if (newWords.length > 0) {
+                        newItems.push(new TextItem({
+                            ...lineItem,
+                            text: newWords.join(' '),
+                            annotation: ADDED_ANNOTATION,
+                            parsedElements: new ParsedElements({ ...lineItem.parsedElements, containLinks: foundLink })
+                        }));
+                    }
+                }
+            });
+            page.items = newItems;
+        });
+        return new ParseResult({
+            ...parseResult,
+            messages: [
+                'Stripped ' + strippedWhitespace + ' superflous whitespaces',
+                'Found ' + foundLinks + ' links',
+            ]
+        });
+    }
+}
|