From 07e7fbb505c06d10881752d27e2a4270ca0cc175 Mon Sep 17 00:00:00 2001 From: Johannes Zillmann Date: Sat, 18 Mar 2017 08:56:08 +0100 Subject: [PATCH] [WIP] Add remove whitespace and detect links again --- .../components/debug/TextItemTable.jsx | 2 +- src/javascript/models/AppState.jsx | 10 +-- src/javascript/models/PageItem.jsx | 6 +- .../models/TextItemLineCompactor.jsx | 6 +- .../transformations/old/DetectLinks.jsx | 54 ------------- .../transformations/old/RemoveWhitespaces.jsx | 51 ------------- .../transformations/textitem/CompactLines.jsx | 22 +++--- .../textitem/PostprocessLines.jsx | 75 +++++++++++++++++++ 8 files changed, 95 insertions(+), 131 deletions(-) delete mode 100644 src/javascript/models/transformations/old/DetectLinks.jsx delete mode 100644 src/javascript/models/transformations/old/RemoveWhitespaces.jsx create mode 100644 src/javascript/models/transformations/textitem/PostprocessLines.jsx diff --git a/src/javascript/components/debug/TextItemTable.jsx b/src/javascript/components/debug/TextItemTable.jsx index 91b0ac7..07c6712 100644 --- a/src/javascript/components/debug/TextItemTable.jsx +++ b/src/javascript/components/debug/TextItemTable.jsx @@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } - { textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' } + { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
diff --git a/src/javascript/models/AppState.jsx b/src/javascript/models/AppState.jsx index dc83e4d..a18d8c5 100644 --- a/src/javascript/models/AppState.jsx +++ b/src/javascript/models/AppState.jsx @@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat import CompactLines from './transformations/textitem/CompactLines.jsx'; import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; +import PostprocessLines from './transformations/textitem/PostprocessLines.jsx'; import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' @@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx' -// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx' // import DetectFormats from './transformations/DetectFormats.jsx' -// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx' -// import DetectLinks from './transformations/DetectLinks.jsx' -// import HeadlineDetector from './transformations/HeadlineDetector.jsx' // import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToMarkdown from './transformations/ToMarkdown.jsx' @@ -33,6 +30,7 @@ export default class AppState { new CompactLines(), new RemoveRepetitiveElements(), new VerticalToHorizontal(), + new PostprocessLines(), new DetectTOC(), new DetectListItems(), new DetectHeaders(), @@ -40,12 +38,8 @@ export default class AppState { new GatherBlocks(), new DetectCodeQuoteBlocks(), new DetectListLevels(), - // new DetectHeadlines(), // new DetectFormats(), - // new RemoveWhitespaces(), - // new DetectLinks(), - // new HeadlineDetector(), // new HeadlineToUppercase(), new ToTextBlocks(), new ToMarkdown()]; diff --git a/src/javascript/models/PageItem.jsx b/src/javascript/models/PageItem.jsx index 27c178e..080fb02 100644 --- a/src/javascript/models/PageItem.jsx +++ b/src/javascript/models/PageItem.jsx @@ -15,13 +15,15 @@ export default class PageItem { export class ParsedElements { constructor(options) { - this.footnoteLinks = options.footnoteLinks; - this.footnotes = options.footnotes; + this.footnoteLinks = options.footnoteLinks || []; + this.footnotes = options.footnotes || []; + this.containLinks = options.containLinks; } add(parsedElements) { this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnotes = this.footnotes.concat(parsedElements.footnotes); + this.containLinks = this.containLinks || parsedElements.containLinks; } } \ No newline at end of file diff --git a/src/javascript/models/TextItemLineCompactor.jsx b/src/javascript/models/TextItemLineCompactor.jsx index 74c69b8..68b0bd7 100644 --- a/src/javascript/models/TextItemLineCompactor.jsx +++ b/src/javascript/models/TextItemLineCompactor.jsx @@ -51,10 +51,6 @@ export default class TextItemLineCompactor { }); } combinedItem.parsedElements = parsedElements; - - //TODO whitespace removal - //TODO bold/emphasis - return combinedItem; } @@ -80,7 +76,7 @@ export default class TextItemLineCompactor { //TODO womb comp [29] => ydiff == 0 newLineItems.push(new TextItem({ ...stashedNumberItems[0], - text: `(^${ joinedNumber}):` + text: `(^${ joinedNumber}): ` })); footnotes.push(joinedNumber); } else { diff --git a/src/javascript/models/transformations/old/DetectLinks.jsx b/src/javascript/models/transformations/old/DetectLinks.jsx deleted file mode 100644 index a05b5cf..0000000 --- a/src/javascript/models/transformations/old/DetectLinks.jsx +++ /dev/null @@ -1,54 +0,0 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; -import TextItem from '../TextItem.jsx'; -import ParseResult from '../ParseResult.jsx'; - -import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; - -export default class DetectLinks extends ToPdfViewTransformation { - - constructor() { - super("Detect Links"); - } - - transform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - const newTextItems = []; - page.textItems.forEach(item => { - newTextItems.push(item); - var words = item.text.split(' '); - var changedWords = []; - var change = false; - words.forEach(word => { - if (word.startsWith('http:')) { - changedWords.push(`[${word}](${word})`); - change = true; - } else if (word.startsWith('www.')) { - changedWords.push(`[http://${word}](http://${word})`); - change = true; - } else { - changedWords.push(word); - } - }); - if (change) { - newTextItems.push(new TextItem({ - ...item, - text: changedWords.join(' '), - annotation: ADDED_ANNOTATION, - })); - item.annotation = REMOVED_ANNOTATION; - } - }); - page.textItems = newTextItems; - }); - return parseResult; - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx b/src/javascript/models/transformations/old/RemoveWhitespaces.jsx deleted file mode 100644 index 8c91c3c..0000000 --- a/src/javascript/models/transformations/old/RemoveWhitespaces.jsx +++ /dev/null @@ -1,51 +0,0 @@ -import ToPdfViewTransformation from './ToPdfViewTransformation.jsx'; -import TextItem from '../TextItem.jsx'; -import ParseResult from '../ParseResult.jsx'; - -import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx'; - -export default class RemoveWhitespaces extends ToPdfViewTransformation { - - constructor() { - super("Remove Whitespaces"); - this.showWhitespaces = true; - } - - transform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - const newTextItems = []; - page.textItems.forEach(item => { - newTextItems.push(item); - var words = item.text.trim().split(' '); - var changedWords = []; - var change = false; - words.forEach(word => { - if (word.length == 0) { - change = true; - } else { - changedWords.push(word); - } - }); - if (change) { - newTextItems.push(new TextItem({ - ...item, - text: changedWords.join(' '), - annotation: ADDED_ANNOTATION, - })); - item.annotation = REMOVED_ANNOTATION; - } - }); - page.textItems = newTextItems; - }); - return parseResult; - } - - completeTransform(parseResult:ParseResult) { - parseResult.content.forEach(page => { - page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION); - page.textItems.forEach(textItem => textItem.annotation = null) - }); - return parseResult; - } - -} \ No newline at end of file diff --git a/src/javascript/models/transformations/textitem/CompactLines.jsx b/src/javascript/models/transformations/textitem/CompactLines.jsx index 3cc89ad..890e2ee 100644 --- a/src/javascript/models/transformations/textitem/CompactLines.jsx +++ b/src/javascript/models/transformations/textitem/CompactLines.jsx @@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; export default class CompactLines extends ToTextItemTransformation { constructor() { - super("Compact Lines"); + super("Compact To Lines"); } transform(parseResult:ParseResult) { @@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation { const newItems = []; const textItemsGroupedByLine = lineGrouper.group(page.items); textItemsGroupedByLine.forEach(textItemsOfLine => { + var lineItem; if (textItemsOfLine.length == 1) { - newItems.push(textItemsOfLine[0]); + lineItem = textItemsOfLine[0]; } else { textItemsOfLine.forEach(item => { item.annotation = REMOVED_ANNOTATION; newItems.push(item); }); - const combinedItem = lineCompactor.compact(textItemsOfLine); - combinedItem.annotation = ADDED_ANNOTATION; - newItems.push(combinedItem); + lineItem = lineCompactor.compact(textItemsOfLine); + lineItem.annotation = ADDED_ANNOTATION; - if (combinedItem.parsedElements.footnoteLinks.length > 0) { - const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },); + if (lineItem.parsedElements.footnoteLinks.length > 0) { + const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => { footnoteLink },); foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); } - if (combinedItem.parsedElements.footnotes.length > 0) { - combinedItem.type = ElementType.FOOTNOTES; - const footnotes = combinedItem.parsedElements.footnotes.map(footnote => { footnote },); + if (lineItem.parsedElements.footnotes.length > 0) { + lineItem.type = ElementType.FOOTNOTES; + const footnotes = lineItem.parsedElements.footnotes.map(footnote => { footnote },); foundFootnotes.push.apply(foundFootnotes, footnotes); } } + lineItem.text = lineItem.text.trim(); + newItems.push(lineItem); }); page.items = newItems; } diff --git a/src/javascript/models/transformations/textitem/PostprocessLines.jsx b/src/javascript/models/transformations/textitem/PostprocessLines.jsx new file mode 100644 index 0000000..4ec2c03 --- /dev/null +++ b/src/javascript/models/transformations/textitem/PostprocessLines.jsx @@ -0,0 +1,75 @@ +import ToTextItemTransformation from '../ToTextItemTransformation.jsx'; +import ParseResult from '../../ParseResult.jsx'; +import TextItem from '../../TextItem.jsx'; +import { ParsedElements } from '../../PageItem.jsx'; +import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx'; + + +// Remove whitespace, detect links, etc... +export default class PostprocessLines extends ToTextItemTransformation { + + constructor() { + super("Remove Whitespace & Detect Links"); + this.showWhitespaces = true; + } + + transform(parseResult:ParseResult) { + var strippedWhitespace = 0; + var foundLinks = 0; + + parseResult.pages.forEach(page => { + const newItems = []; + page.items.forEach(lineItem => { + newItems.push(lineItem); + var words = lineItem.text.split(' '); + var newWords = []; + var foundSuperflousNewLine = false; + var foundLink = false; + words.forEach(word => { + if (word.trim().length == 0) { + foundSuperflousNewLine = true; + strippedWhitespace++; + } else { + if (word.startsWith('http:')) { + foundLinks++; + foundLink = true; + newWords.push(`[${word}](${word})`); + } else if (word.startsWith('www.')) { + foundLinks++; + foundLink = true; + newWords.push(`[http://${word}](http://${word})`); + } else { + newWords.push(word); + } + } + }); + if (foundSuperflousNewLine || foundLink) { + lineItem.annotation = REMOVED_ANNOTATION; + if (newWords.length > 0) { + newItems.push(new TextItem({ + ...lineItem, + text: newWords.join(' '), + annotation: ADDED_ANNOTATION, + parsedElements: new ParsedElements({ + ...lineItem.parsedElements, + containLinks: foundLink + }) + })); + } + } + }); + page.items = newItems; + }); + + + return new ParseResult({ + ...parseResult, + messages: [ + 'Stripped ' + strippedWhitespace + ' superflous whitespaces', + 'Found ' + foundLinks + ' links', + ] + }); + } + + +}