[WIP] Re-add whitespace removal and link detection

This commit is contained in:
Johannes Zillmann 2017-03-18 08:56:08 +01:00
parent 4600dc6ee7
commit 07e7fbb505
8 changed files with 95 additions and 131 deletions

View File

@ -54,7 +54,7 @@ export default class TextItemTable extends React.Component {
</div> </div>
<div style={ { textAlign: 'center', color: 'orange' } }> <div style={ { textAlign: 'center', color: 'orange' } }>
{ textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' } { textItem.parsedElements && textItem.parsedElements.footnoteLinks.length > 0 ? 'Footnote-Link' : '' }
{ textItem.parsedElements && textItem.parsedElements.footnotes.length > 0 ? 'Footnote' : '' } { textItem.parsedElements && textItem.parsedElements.containLinks ? 'Link' : '' }
</div> </div>
</td> </td>
<td> <td>

View File

@ -4,6 +4,7 @@ import CalculateGlobalStats from './transformations/textitem/CalculateGlobalStat
import CompactLines from './transformations/textitem/CompactLines.jsx'; import CompactLines from './transformations/textitem/CompactLines.jsx';
import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx' import RemoveRepetitiveElements from './transformations/textitem/RemoveRepetitiveElements.jsx'
import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx'; import VerticalToHorizontal from './transformations/textitem/VerticalToHorizontal.jsx';
import PostprocessLines from './transformations/textitem/PostprocessLines.jsx';
import DetectTOC from './transformations/textitem/DetectTOC.jsx' import DetectTOC from './transformations/textitem/DetectTOC.jsx'
import DetectListItems from './transformations/textitem/DetectListItems.jsx' import DetectListItems from './transformations/textitem/DetectListItems.jsx'
import DetectHeaders from './transformations/textitem/DetectHeaders.jsx' import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
@ -11,11 +12,7 @@ import DetectHeaders from './transformations/textitem/DetectHeaders.jsx'
import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx' import GatherBlocks from './transformations/textitemblock/GatherBlocks.jsx'
import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx' import DetectCodeQuoteBlocks from './transformations/textitemblock/DetectCodeQuoteBlocks.jsx'
import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx' import DetectListLevels from './transformations/textitemblock/DetectListLevels.jsx'
// import DetectHeadlines from './transformations/textitemblock/DetectHeadlines.jsx'
// import DetectFormats from './transformations/DetectFormats.jsx' // import DetectFormats from './transformations/DetectFormats.jsx'
// import RemoveWhitespaces from './transformations/RemoveWhitespaces.jsx'
// import DetectLinks from './transformations/DetectLinks.jsx'
// import HeadlineDetector from './transformations/HeadlineDetector.jsx'
// import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx' // import HeadlineToUppercase from './transformations/HeadlineToUppercase.jsx'
import ToTextBlocks from './transformations/ToTextBlocks.jsx'; import ToTextBlocks from './transformations/ToTextBlocks.jsx';
import ToMarkdown from './transformations/ToMarkdown.jsx' import ToMarkdown from './transformations/ToMarkdown.jsx'
@ -33,6 +30,7 @@ export default class AppState {
new CompactLines(), new CompactLines(),
new RemoveRepetitiveElements(), new RemoveRepetitiveElements(),
new VerticalToHorizontal(), new VerticalToHorizontal(),
new PostprocessLines(),
new DetectTOC(), new DetectTOC(),
new DetectListItems(), new DetectListItems(),
new DetectHeaders(), new DetectHeaders(),
@ -40,12 +38,8 @@ export default class AppState {
new GatherBlocks(), new GatherBlocks(),
new DetectCodeQuoteBlocks(), new DetectCodeQuoteBlocks(),
new DetectListLevels(), new DetectListLevels(),
// new DetectHeadlines(),
// new DetectFormats(), // new DetectFormats(),
// new RemoveWhitespaces(),
// new DetectLinks(),
// new HeadlineDetector(),
// new HeadlineToUppercase(), // new HeadlineToUppercase(),
new ToTextBlocks(), new ToTextBlocks(),
new ToMarkdown()]; new ToMarkdown()];

View File

@ -15,13 +15,15 @@ export default class PageItem {
export class ParsedElements { export class ParsedElements {
constructor(options) { constructor(options) {
this.footnoteLinks = options.footnoteLinks; this.footnoteLinks = options.footnoteLinks || [];
this.footnotes = options.footnotes; this.footnotes = options.footnotes || [];
this.containLinks = options.containLinks;
} }
add(parsedElements) { add(parsedElements) {
this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks); this.footnoteLinks = this.footnoteLinks.concat(parsedElements.footnoteLinks);
this.footnotes = this.footnotes.concat(parsedElements.footnotes); this.footnotes = this.footnotes.concat(parsedElements.footnotes);
this.containLinks = this.containLinks || parsedElements.containLinks;
} }
} }

View File

@ -51,10 +51,6 @@ export default class TextItemLineCompactor {
}); });
} }
combinedItem.parsedElements = parsedElements; combinedItem.parsedElements = parsedElements;
//TODO whitespace removal
//TODO bold/emphasis
return combinedItem; return combinedItem;
} }

View File

@ -1,54 +0,0 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
export default class DetectLinks extends ToPdfViewTransformation {
constructor() {
super("Detect Links");
}
transform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
const newTextItems = [];
page.textItems.forEach(item => {
newTextItems.push(item);
var words = item.text.split(' ');
var changedWords = [];
var change = false;
words.forEach(word => {
if (word.startsWith('http:')) {
changedWords.push(`[${word}](${word})`);
change = true;
} else if (word.startsWith('www.')) {
changedWords.push(`[http://${word}](http://${word})`);
change = true;
} else {
changedWords.push(word);
}
});
if (change) {
newTextItems.push(new TextItem({
...item,
text: changedWords.join(' '),
annotation: ADDED_ANNOTATION,
}));
item.annotation = REMOVED_ANNOTATION;
}
});
page.textItems = newTextItems;
});
return parseResult;
}
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}

View File

@ -1,51 +0,0 @@
import ToPdfViewTransformation from './ToPdfViewTransformation.jsx';
import TextItem from '../TextItem.jsx';
import ParseResult from '../ParseResult.jsx';
import { ADDED_ANNOTATION, REMOVED_ANNOTATION } from '../Annotation.jsx';
export default class RemoveWhitespaces extends ToPdfViewTransformation {
constructor() {
super("Remove Whitespaces");
this.showWhitespaces = true;
}
transform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
const newTextItems = [];
page.textItems.forEach(item => {
newTextItems.push(item);
var words = item.text.trim().split(' ');
var changedWords = [];
var change = false;
words.forEach(word => {
if (word.length == 0) {
change = true;
} else {
changedWords.push(word);
}
});
if (change) {
newTextItems.push(new TextItem({
...item,
text: changedWords.join(' '),
annotation: ADDED_ANNOTATION,
}));
item.annotation = REMOVED_ANNOTATION;
}
});
page.textItems = newTextItems;
});
return parseResult;
}
completeTransform(parseResult:ParseResult) {
parseResult.content.forEach(page => {
page.textItems = page.textItems.filter(textItem => !textItem.annotation || textItem.annotation !== REMOVED_ANNOTATION);
page.textItems.forEach(textItem => textItem.annotation = null)
});
return parseResult;
}
}

View File

@ -12,7 +12,7 @@ import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
export default class CompactLines extends ToTextItemTransformation { export default class CompactLines extends ToTextItemTransformation {
constructor() { constructor() {
super("Compact Lines"); super("Compact To Lines");
} }
transform(parseResult:ParseResult) { transform(parseResult:ParseResult) {
@ -29,28 +29,30 @@ export default class CompactLines extends ToTextItemTransformation {
const newItems = []; const newItems = [];
const textItemsGroupedByLine = lineGrouper.group(page.items); const textItemsGroupedByLine = lineGrouper.group(page.items);
textItemsGroupedByLine.forEach(textItemsOfLine => { textItemsGroupedByLine.forEach(textItemsOfLine => {
var lineItem;
if (textItemsOfLine.length == 1) { if (textItemsOfLine.length == 1) {
newItems.push(textItemsOfLine[0]); lineItem = textItemsOfLine[0];
} else { } else {
textItemsOfLine.forEach(item => { textItemsOfLine.forEach(item => {
item.annotation = REMOVED_ANNOTATION; item.annotation = REMOVED_ANNOTATION;
newItems.push(item); newItems.push(item);
}); });
const combinedItem = lineCompactor.compact(textItemsOfLine); lineItem = lineCompactor.compact(textItemsOfLine);
combinedItem.annotation = ADDED_ANNOTATION; lineItem.annotation = ADDED_ANNOTATION;
newItems.push(combinedItem);
if (combinedItem.parsedElements.footnoteLinks.length > 0) { if (lineItem.parsedElements.footnoteLinks.length > 0) {
const footnoteLinks = combinedItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>); const footnoteLinks = lineItem.parsedElements.footnoteLinks.map(footnoteLink => <span key={ footnoteLink }><a href={ "#Page " + (page.index + 1) }>{ footnoteLink }</a>,</span>);
foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks); foundFootnoteLinks.push.apply(foundFootnoteLinks, footnoteLinks);
} }
if (combinedItem.parsedElements.footnotes.length > 0) { if (lineItem.parsedElements.footnotes.length > 0) {
combinedItem.type = ElementType.FOOTNOTES; lineItem.type = ElementType.FOOTNOTES;
const footnotes = combinedItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>); const footnotes = lineItem.parsedElements.footnotes.map(footnote => <span key={ footnote }><a href={ "#Page " + (page.index + 1) }>{ footnote }</a>,</span>);
foundFootnotes.push.apply(foundFootnotes, footnotes); foundFootnotes.push.apply(foundFootnotes, footnotes);
} }
} }
lineItem.text = lineItem.text.trim();
newItems.push(lineItem);
}); });
page.items = newItems; page.items = newItems;
} }

View File

@ -0,0 +1,75 @@
import ToTextItemTransformation from '../ToTextItemTransformation.jsx';
import ParseResult from '../../ParseResult.jsx';
import TextItem from '../../TextItem.jsx';
import { ParsedElements } from '../../PageItem.jsx';
import { REMOVED_ANNOTATION, ADDED_ANNOTATION } from '../../Annotation.jsx';
// Remove whitespace, detect links, etc...
export default class PostprocessLines extends ToTextItemTransformation {
constructor() {
super("Remove Whitespace & Detect Links");
this.showWhitespaces = true;
}
transform(parseResult:ParseResult) {
var strippedWhitespace = 0;
var foundLinks = 0;
parseResult.pages.forEach(page => {
const newItems = [];
page.items.forEach(lineItem => {
newItems.push(lineItem);
var words = lineItem.text.split(' ');
var newWords = [];
var foundSuperflousNewLine = false;
var foundLink = false;
words.forEach(word => {
if (word.trim().length == 0) {
foundSuperflousNewLine = true;
strippedWhitespace++;
} else {
if (word.startsWith('http:')) {
foundLinks++;
foundLink = true;
newWords.push(`[${word}](${word})`);
} else if (word.startsWith('www.')) {
foundLinks++;
foundLink = true;
newWords.push(`[http://${word}](http://${word})`);
} else {
newWords.push(word);
}
}
});
if (foundSuperflousNewLine || foundLink) {
lineItem.annotation = REMOVED_ANNOTATION;
if (newWords.length > 0) {
newItems.push(new TextItem({
...lineItem,
text: newWords.join(' '),
annotation: ADDED_ANNOTATION,
parsedElements: new ParsedElements({
...lineItem.parsedElements,
containLinks: foundLink
})
}));
}
}
});
page.items = newItems;
});
return new ParseResult({
...parseResult,
messages: [
'Stripped ' + strippedWhitespace + ' superflous whitespaces',
'Found ' + foundLinks + ' links',
]
});
}
}